1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2 of the License, or
6 (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 */
17
18 #include "udm_config.h"
19
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <errno.h>
23 #include <string.h>
24 #include <sys/types.h>
25 #include <fcntl.h>
26 #include <signal.h>
27 #ifdef HAVE_LOCALE_H
28 #include <locale.h>
29 #endif
30
31 #ifdef WIN32
32 #include <process.h>
33 #endif
34
35 #ifdef HAVE_READLINE
36 #include <readline/readline.h>
37 #include <readline/history.h>
38 #endif
39
40 #ifdef CHASEN
41 #include <chasen.h>
42 #endif
43
44 #ifdef MECAB
45 #include <mecab.h>
46 #endif
47
48 #include "udmsearch.h"
49 #include "udm_sqldbms.h" /* TODO34: Remove this */
50 #include "udm_db_int.h" /* TODO34: Remove this */
51 #include "udm_query.h" /* TODO34: Remove this */
52 #include "udm_http.h"
53
54 /* This should be last include */
55 #ifdef DMALLOC
56 #include "dmalloc.h"
57 #endif
58
59 static int loglevel= UDM_LOG_INFO;
60 static char cname[1024]= "";
61
62 static UDM_ENV Conf;
63
64 extern unsigned int seconds; /* To sleep between documents */
65 extern int flags; /* For indexer */
66 extern int total_threads; /* Total threads number */
67 extern int sleep_threads; /* Number of sleeping threads */
68 extern int max_index_time;
69 extern int maxthreads;
70 extern UDM_CRAWLER *ThreadCrawlers;
71 extern int thd_errors;
72
73 #ifdef HAVE_PTHREAD
74 static udm_thread_t *threads= NULL;
75
76 #ifdef WIN32
77 #include <time.h>
/*
  Replacement for localtime_r() used in the WIN32 build.
  Converts *clock with localtime() and copies the broken-down time
  into the caller-supplied *result.

  Returns result on success, or NULL when localtime() cannot
  convert the value (in which case *result is left untouched).
*/
static struct tm*
localtime_r(const time_t *clock, struct tm *result)
{
  const struct tm *converted= localtime(clock);
  if (!converted)
    return NULL;            /* localtime() may fail, e.g. for out-of-range values */
  *result= *converted;
  return result;
}
84 #define strptime(x, y, z) (0)
85 #endif /* WIN32 */
86 #endif /* HAVE_PTHREAD */
87
88
89 /* CallBack function for log information */
90 #ifdef WIN32
/*
  Log callback (WIN32 build): prints one line in the form
  "<agent handle> <state> <str>" to stdout.
  When no agent is given, 0 is printed as the handle.
*/
void UdmShowInfo(UDM_AGENT *A, const char *state, const char *str)
{
  printf("%d %s %s\n",
         (A != NULL) ? A->handle : 0,
         state,
         str);
}
95 #else
96 extern UDM_API(void) UdmShowInfo(UDM_AGENT* A, const char *state, const char* str);
97 #endif /* WIN32 */
98
99
100 static const char *execsql= NULL;
101 static char *
sqlexecgets(UDM_IOHANDLER * iohandler,char * str,size_t len)102 sqlexecgets(UDM_IOHANDLER *iohandler, char *str, size_t len)
103 {
104 UDM_SQLMON_PARAM *prm= (UDM_SQLMON_PARAM*) iohandler->user_data;
105 if (!execsql)
106 return NULL;
107 udm_snprintf(str, len, "%s", execsql);
108 prm->flags|= UDM_SQLMON_DONT_NEED_SEMICOLON;
109 execsql= NULL;
110 return str;
111 }
112
113
/*
  Line reader for the interactive SQL monitor.

  With readline support and a terminal on file descriptor 0, the line is
  read with readline() (prompt "SQL>"), non-empty lines are added to the
  history, and the text is copied into str followed by "\n".
  Otherwise the prompt is printed through the monitor's prompt callback
  (only when loglevel >= UDM_LOG_INFO) and the line is read with fgets().

  Returns str, or NULL on end of input.
*/
static char* sqlmongets(UDM_IOHANDLER *iohandler, char *str, size_t size)
{
  UDM_SQLMON_PARAM *prm= (UDM_SQLMON_PARAM*) iohandler->user_data;
#ifdef HAVE_READLINE
  if (isatty(0))
  {
    char prompt[]="SQL>";
    char *line= readline(prompt);
    if (!line)
      return NULL;

    if (*line) add_history(line);
    /* We need "\n" at the end to make sqlmon work properly */
    udm_snprintf(str, size, "%s\n", line);
    /*
      readline() returns a malloc()ed buffer owned by the caller;
      the content was copied into str above, so release it here.
    */
    free(line);
  }
  else
#endif
  {
    if (loglevel >= UDM_LOG_INFO)
      prm->iohandler.prompt(&prm->iohandler, UDM_MSG_PROMPT, "SQL>");
    if (!fgets(str, (int) size, stdin))
      return NULL;
  }
  return str;
}
139
140
141 static udm_rc_t
sqlmonprompt(UDM_IOHANDLER * iohandler,udm_msg_t msgtype,const char * msg)142 sqlmonprompt(UDM_IOHANDLER *iohandler, udm_msg_t msgtype, const char *msg)
143 {
144 UDM_SQLMON_PARAM *prm= (UDM_SQLMON_PARAM*) iohandler->user_data;
145 switch (msgtype)
146 {
147 case UDM_MSG_ERROR:
148 if (prm->mode == udm_sqlmon_mode_batch)
149 fprintf(stdout, "ERROR at line %d: %s\n", (int) prm->lineno + 1, msg);
150 else
151 fprintf(stdout, "ERROR: %s\n", msg);
152 break;
153 case UDM_MSG_INFO:
154 if (loglevel >= UDM_LOG_INFO)
155 fprintf(stdout, "%s\n", msg);
156 break;
157 case UDM_MSG_COMMAND:
158 if (loglevel >= UDM_LOG_INFO)
159 fprintf(stdout, "'%s'\n", msg);
160 break;
161 case UDM_MSG_EOL:
162 fprintf(stdout, "\n");
163 break;
164 case UDM_MSG_DATA:
165 case UDM_MSG_PROMPT:
166 fprintf(stdout, "%s", msg);
167 break;
168 }
169 return UDM_OK;
170 }
171
172
173 static udm_rc_t
UdmIndCreate(UDM_AGENT * Agent)174 UdmIndCreate(UDM_AGENT *Agent)
175 {
176 udm_rc_t rc;
177 if (UDM_OK != (rc= UdmDBAction(Agent, UDM_DBCMD_CREATE)))
178 UdmLog(Agent, UDM_LOG_ERROR, "Error: '%s'", UdmEnvErrMsg(Agent->Conf));
179 return rc;
180 }
181
182
183 static udm_rc_t
UdmIndDrop(UDM_AGENT * Agent)184 UdmIndDrop(UDM_AGENT *Agent)
185 {
186 udm_rc_t rc;
187 if (UDM_OK != (rc= UdmDBAction(Agent, UDM_DBCMD_DROP)))
188 UdmLog(Agent, UDM_LOG_ERROR, "Error: '%s'", UdmEnvErrMsg(Agent->Conf));
189 return rc;
190 }
191
192
193 static udm_rc_t
ShowStatistics(UDM_AGENT * Indexer,UDM_QUERY * Query)194 ShowStatistics(UDM_AGENT *Indexer, UDM_QUERY *Query)
195 {
196 udm_rc_t rc;
197 struct tm tm;
198 const char *stat_time;
199 char sbuf[32];
200 size_t snum;
201 UDM_STAT Total;
202
203 bzero((void*)&Total, sizeof(Total));
204 Query->StatList.time= time(NULL);
205 stat_time = UdmVarListFindStr(&Conf.Vars, "stat_time", "0");
206 bzero(&tm, sizeof(tm));
207
208 #ifndef WIN32
209 if (stat_time &&
210 strlen(stat_time) >= 7 &&
211 stat_time[4] == '-' &&
212 (stat_time[7] == '-' || !stat_time[7]) &&
213 (strptime(stat_time, "%Y-%m-%d %H:%M:%S", &tm) ||
214 strptime(stat_time, "%Y-%m-%d %H:%M", &tm) ||
215 strptime(stat_time, "%Y-%m-%d %H:%M", &tm) ||
216 strptime(stat_time, "%Y-%m-%d %H", &tm) ||
217 strptime(stat_time, "%Y-%m-%d", &tm) ||
218 strptime(stat_time, "%Y-%m", &tm)))
219 {
220 Query->StatList.time = mktime(&tm);
221 }
222 else if (stat_time && (Query->StatList.time= Udm_dp2time_t(stat_time)) >= 0)
223 {
224 Query->StatList.time += time(NULL);
225 localtime_r(&Query->StatList.time, &tm);
226 }
227 else
228 {
229 Query->StatList.time = time(NULL);
230 localtime_r(&Query->StatList.time, &tm);
231 }
232 #else
233 {
234 struct tm *tm1;
235 Query->StatList.time= time(NULL);
236 tm= *(tm1= localtime(&Query->StatList.time));
237 }
238 #endif
239 if (UDM_OK != (rc= UdmQueryAction(Indexer, Query, UDM_QUERYCMD_STATISTICS)))
240 {
241 UdmLog(Indexer, UDM_LOG_ERROR, "Error: '%s'", UdmEnvErrMsg(Indexer->Conf));
242 goto ex;
243 }
244
245 strftime(sbuf, sizeof(sbuf), "%Y-%m-%d %H:%M:%S", &tm);
246 printf("\n Database statistics [%s]\n\n", sbuf);
247 printf("%10s %10s %10s\n","Status","Expired","Total");
248 printf(" -----------------------------\n");
249 for (snum= 0; snum < Query->StatList.nstats; snum++)
250 {
251 UDM_STAT *S= &Query->StatList.Stat[snum];
252 printf("%10d %10d %10d %s\n",S->status,S->expired,S->total,UdmHTTPErrMsg(S->status));
253 Total.expired+=S->expired;
254 Total.total+=S->total;
255 }
256 printf(" -----------------------------\n");
257 printf("%10s %10d %10d\n","Total",Total.expired,Total.total);
258 printf("\n");
259
260 ex:
261 return rc;
262 }
263
264 /* CallBack Func for Referers*/
/*
  Referer callback: writes "<code> <url> <ref>" followed by a newline
  to stdout.
*/
static void UdmRefProc(int code, const char *url, const char *ref)
{
  fprintf(stdout, "%d %s %s\n", code, url, ref);
}
269
270 static udm_rc_t
ShowReferers(UDM_AGENT * Indexer,UDM_QUERY * Query)271 ShowReferers(UDM_AGENT *Indexer, UDM_QUERY *Query)
272 {
273 return UdmQueryAction(Indexer, Query, UDM_QUERYCMD_REFERERS);
274 }
275
276
277 #undef THINFO_TEST
278 #ifdef THINFO_TEST
279 /* CallBack function for Thread information */
/*
  Thread information callback (compiled only when THINFO_TEST is defined):
  writes "<handle> <state> <str>" followed by a newline to stdout.
*/
void UdmShowThreadInfoProc(int handle, char *state, char *str)
{
  fprintf(stdout, "%d %s %s\n", handle, state, str);
}
284 #endif
285
286
cmpgrp(const void * v1,const void * v2)287 static int cmpgrp(const void *v1, const void *v2)
288 {
289 int res;
290 UDM_CHARSET *c1= (UDM_CHARSET*) v1;
291 UDM_CHARSET *c2= (UDM_CHARSET*) v2;
292 if ((res = strcasecmp(UdmCsGroup(c1), UdmCsGroup(c2))))
293 return res;
294 return strcasecmp(c1->name,c2->name);
295 }
296
display_charsets(FILE * file)297 static void display_charsets(FILE *file)
298 {
299 UDM_CHARSET *cs=NULL;
300 struct udm_cset_st c[100];
301 size_t i=0;
302 size_t n=0;
303 int family=-1;
304
305 for(cs=UdmGetCharSetByID(0) ; cs && cs->name ; cs++)
306 {
307 /* Skip not compiled charsets */
308 if (cs->family != UDM_CHARSET_UNKNOWN)
309 c[n++]=*cs;
310 }
311 fprintf(file,"\n%d charsets available:\n", (int) n);
312
313 UdmSort(c,n,sizeof(UDM_CHARSET),&cmpgrp);
314 for(i=0;i<n;i++)
315 {
316 if (family!=c[i].family)
317 {
318 fprintf(file, "\n%19s : ", UdmCsGroup(&c[i]));
319 family=c[i].family;
320 }
321 fprintf(file,"%s ",c[i].name);
322 }
323 fprintf(file,"\n");
324 }
325
326
/*
  Print the help page to stdout: program banner, usage line, the
  description of all command line options, and the bug report address.
  When level is greater than 1 the list of available charsets is
  printed as well.

  Always returns 0.
*/
static int usage(int level, UDM_CMDLINE_OPT *options)
{
  FILE *out= stdout;

  fprintf(out,
          "\n"
          "indexer from %s-%s-%s\n"
          "http://www.mnogosearch.org/ (C)1998-2015, LavTech Corp.\n"
          "\n"
          "Usage: indexer [OPTIONS] [configfile]\n",
          PACKAGE, VERSION, UDM_DBTYPE);

  UdmCmdLineOptionsPrint(options, out);

  fprintf(out,
          "\n"
          "\n"
          "Please post bug reports and suggestions at http://www.mnogosearch.org/bugs/\n");

  if (level > 1)
    display_charsets(out);
  return 0;
}
346
347
348 /*
349 Load indexer.conf and check if any DBAddr were given
350 */
351 static udm_rc_t
UdmIndexerEnvLoad(UDM_AGENT * Indexer,const char * fname,int lflags)352 UdmIndexerEnvLoad(UDM_AGENT *Indexer, const char *fname,int lflags)
353 {
354 udm_rc_t rc;
355 if (UDM_OK == (rc= UdmEnvLoad(Indexer, fname, lflags)))
356 {
357 if (Indexer->Conf->DBList.nitems == 0)
358 {
359 sprintf(Indexer->Conf->errstr, "Error: '%s': No required DBAddr commands were specified", fname);
360 rc= UDM_ERROR;
361 }
362 }
363 return rc;
364 }
365
366
367
368 /*
369 Parse command line
370 */
/*
  File-scope copies of the program argument count and vector.
  NOTE(review): presumably filled in by main(), which is outside the
  visible part of this file; the only visible reference is in the
  commented-out UdmReloadEnv() - confirm they are still needed.
*/
371 static int UdmARGC;
372 static char **UdmARGV;
373
374
375
/*
  Options collected from the command line by UdmCmdLineHandleOption().
  Defaults are assigned in UdmIndexerOptionsInit().
*/
376 typedef struct
377 {
378 const char *url_filename; /* -f: file name with URLs, NULL when not given */
379 udm_indcmd_t cmd; /* Command to execute, UDM_IND_CRAWL by default */
380 udm_bool_t insert; /* -i: insert new URLs */
381 udm_bool_t expire; /* -a: set by the -a option */
382 udm_bool_t block; /* -b: set by the -b option */
383 udm_bool_t help; /* -h, -?: incremented once per occurrence */
384 udm_bool_t log2stderr; /* Cleared by -l */
385 udm_bool_t have_loglevel; /* Set when -v was given */
386 udm_bool_t warnings; /* Cleared by -w; when set, UdmIndDelete() asks for confirmation */
387
/* Load flags; reset to 0 by commands that do not need the data */
388 int add_servers; /* UDM_FLAG_ADD_SERV by default; -qq adds UDM_FLAG_DONT_ADD_TO_DB */
389 int add_server_urls; /* UDM_FLAG_ADD_SERVURL by default; -q sets it to 0 */
390 int load_langmaps; /* UDM_FLAG_LOAD_LANGMAP by default */
391 int load_spells; /* UDM_FLAG_SPELL by default */
392 int load_for_dump; /* 0 by default */
393
394 } UDM_INDEXER_OPTIONS;
395
396
397 static void
UdmIndexerOptionsInit(UDM_INDEXER_OPTIONS * io)398 UdmIndexerOptionsInit(UDM_INDEXER_OPTIONS *io)
399 {
400 io->cmd= UDM_IND_CRAWL;
401 io->insert= UDM_FALSE;
402 io->expire= UDM_FALSE;
403 io->block= UDM_FALSE;
404 io->help= UDM_FALSE;
405 io->url_filename= NULL;
406 io->log2stderr= UDM_TRUE;
407 io->have_loglevel= UDM_FALSE;
408
409 io->add_servers= UDM_FLAG_ADD_SERV;
410 io->add_server_urls= UDM_FLAG_ADD_SERVURL;
411 io->load_langmaps= UDM_FLAG_LOAD_LANGMAP;
412 io->load_spells= UDM_FLAG_SPELL;
413 io->load_for_dump= 0;
414 io->warnings= UDM_TRUE;
415 }
416
417
418 static void
UdmFeaturesPrint(const char * mask)419 UdmFeaturesPrint(const char *mask)
420 {
421 UDM_VARLIST W;
422 size_t i;
423 UdmVarListInit(&W);
424 UdmFeatures(&W);
425 for (i= 0; i < W.nvars; i++)
426 {
427 const UDM_VAR *var= UdmVarListFindConstByIndex(&W, i);
428 if (!UdmWildCaseCmp(UdmVarName(var), mask))
429 {
430 UDM_CONST_STR strbuf, *str= UdmVarGetConstStr(var, &strbuf);
431 printf("%s:%.*s\n", UdmVarName(var), (int) str->length, str->str);
432 }
433 }
434 UdmVarListFree(&W);
435 }
436
437 /*
438 Available new options:
439 Capital letters: B GH JK M OP T VXWXYZ
440 Small letters: k x z
441 Digits: 123456789
442 */
/*
  Callback passed to UdmCmdLineOptionsGet(): applies one parsed command
  line option.

  user_data  points to the UDM_INDEXER_OPTIONS being filled
  opt        the matched entry; opt->id selects the action
  value      the option argument, for options that take one

  Depending on the option the result is stored in *io, in the global
  Conf (variables, url_number, Hrefs), or in the global crawler
  settings (flags, seconds, max_index_time, maxthreads, loglevel, cname).
  -F prints the matching features and terminates the process.
  Unknown options are counted as a help request.

  Always returns 0.
*/
443 static int
UdmCmdLineHandleOption(void * user_data,const UDM_CMDLINE_OPT * opt,const char * value)444 UdmCmdLineHandleOption(void *user_data,
445 const UDM_CMDLINE_OPT *opt, const char *value)
446 {
447 UDM_INDEXER_OPTIONS *io= (UDM_INDEXER_OPTIONS*) user_data;
448 switch (opt->id)
449 {
450 case 'F':
451 {
452 UdmFeaturesPrint(value);
453 exit(0);
454 }
/* Commands that clear the server, langmap and spell load flags */
455 case 'C':
456 io->cmd= UDM_IND_DELETE;
457 io->add_servers= io->load_langmaps= io->load_spells=0;
458 break;
459 case 'S':
460 io->cmd= UDM_IND_STAT;
461 io->add_servers= io->load_langmaps= io->load_spells= 0;
462 break;
463 case 'I':
464 io->cmd= UDM_IND_REFERERS;
465 io->add_servers= io->load_langmaps= io->load_spells= 0;
466 break;
467 case 'Q':
468 io->cmd= UDM_IND_SQLMON;
469 io->add_servers= io->load_langmaps= io->load_spells= 0;
470 break;
/* Commands whose option id is the command code itself */
471 case UDM_IND_INDEX:
472 case UDM_IND_CRAWL:
473 case UDM_IND_CREATE:
474 case UDM_IND_DROP:
475 case UDM_IND_CHECKCONF:
476 case UDM_IND_CONVERT:
477 case UDM_IND_MULTI2BLOB:
478 case UDM_IND_EXPORT:
479 case UDM_IND_WRDSTAT:
480 case UDM_IND_REWRITEURL:
481 case UDM_IND_REWRITEPOP:
482 case UDM_IND_HASHSPELL:
483 case UDM_IND_DUMPSPELL:
484 case UDM_IND_REWRITELIMITS:
485 case UDM_IND_DUMPCONF:
486 case UDM_IND_DUMPDATA:
487 case UDM_IND_RESTOREDATA:
488 io->cmd= (udm_indcmd_t) opt->id;
489 break;
/* --exec: run the SQL monitor with the statement stored in variable "exec" */
490 case UDM_IND_EXECSQL:
491 io->cmd= UDM_IND_SQLMON;
492 UdmVarListReplaceStr(&Conf.Vars, "exec", value);
493 io->add_servers= io->load_langmaps= io->load_spells= 0;
494 break;
/* The option name itself is used as the variable name */
495 case UDM_IND_SET0:
496 UDM_ASSERT(opt->name && opt->name[0]);
497 UdmVarListReplaceStr(&Conf.Vars, opt->name, value);
498 break;
/*
  --set name=value stores a string variable. Without '=' (or when the
  name does not fit into the local buffer) the whole argument is used
  as the name of an integer variable set to 1.
*/
499 case UDM_IND_SET:
500 {
501 const char *eq= strchr(value, '=');
502 char name[80];
503 if (eq && ((size_t)(eq - value)) < sizeof(name))
504 {
505 memcpy(name, value, eq - value);
506 name[eq-value]= '\0';
507 UdmVarListReplaceStr(&Conf.Vars, name, eq + 1);
508 }
509 else
510 UdmVarListReplaceInt(&Conf.Vars, value, 1);
511 }
512 break;
513
514 case 'q':
515 if (io->add_server_urls == 0) /* -q already given */
516 {
517 /*
518 "indexer -qq" is given, do even faster start-up.
519 Don't synchronize "Server" commands in indexer.conf
520 with the "server" table content.
521 */
522 io->add_servers|= UDM_FLAG_DONT_ADD_TO_DB;
523 }
524 io->add_server_urls= 0;
525 break;
526 case 'l': io->log2stderr= UDM_FALSE; break;
527 case 'a': io->expire= UDM_TRUE; break;
528 case 'b': io->block= UDM_TRUE; break;
529 case 'e': flags|=UDM_FLAG_SORT_EXPIRED;break;
530 case 'o': flags|=UDM_FLAG_SORT_HOPS;break;
531 case 'r': flags|=UDM_FLAG_DONTSORT_SEED; break;
532 case 'm': flags|=UDM_FLAG_REINDEX;break;
/* Numeric arguments are converted with atoi(), i.e. without error checking */
533 case 'n': Conf.url_number=atoi(value);break;
534 case 'c': max_index_time=atoi(value);break;
535 case 'v': loglevel= atoi(value); io->have_loglevel= UDM_TRUE; break;
536 case 'p': seconds=atoi(value);break;
537 case 't': UdmVarListAddStr(&Conf.Vars,"tag" , value);break;
538 case 's': UdmVarListAddStr(&Conf.Vars, "status", value);break;
539 case 'y': UdmVarListAddStr(&Conf.Vars,"type", value);break;
540 case 'L': UdmVarListAddStr(&Conf.Vars,"lang", value);break;
/*
  -u always adds a "u" limit; the URL is also queued for insertion,
  but only when -i was seen earlier on the command line.
*/
541 case 'u': UdmVarListAddStr(&Conf.Vars,"u" , value);
542 if (io->insert)
543 {
544 UDM_HREFPARAM HrefParam;
545 UdmHrefParamInit(&HrefParam);
546 HrefParam.link_source= UDM_LINK_SOURCE_CMDLINE;
547 UdmHrefListAddConst(&Conf.Hrefs, &HrefParam, value);
548 }
549 break;
/* -N sets both thread-count variables to the same value */
550 case 'N':
551 maxthreads=atoi(value);
552 UdmVarListReplaceInt(&Conf.Vars, "CrawlerThreads", maxthreads);
553 UdmVarListReplaceInt(&Conf.Vars, "IndexerThreads", maxthreads);
554 break;
555 case 'f': io->url_filename= value; break;
556 case 'i': io->insert= UDM_TRUE; break;
557 case 'w': io->warnings= UDM_FALSE; break;
558 case 'j': UdmVarListAddStr(&Conf.Vars, "stat_time", value); break;
/* strncpy() does not terminate on truncation, hence the explicit '\0' */
559 case 'd': strncpy(cname, value, sizeof(cname));
560 cname[sizeof(cname) - 1] = '\0';
561 break;
562 case 'D': UdmVarListAddStr(&Conf.Vars,"DBLimit" , value); break;
/* Help is a counter: each -h or -? (or unknown option) adds one */
563 case '?':
564 case 'h':
565 default:
566 io->help++;
567 }
568 return 0;
569 }
570
571
572 /*
573 Available new options:
574 Capital letters: AB GH JK M OP TUVXWXYZ
575 Small letters: k x z
576 Digits: 123456789
577 */
/*
  Command line option table passed to UdmCmdLineOptionsGet() by
  UdmParseCmdLine() and printed by usage() via UdmCmdLineOptionsPrint().

  Initializer order, as used below: option id (a letter, a UDM_IND_*
  code, or -1 for UDM_OPT_TITLE rows), long name, value type, a pointer
  that is NULL in every row, help text (NULL in some rows).
  The table ends with an all-zero entry.

  NOTE(review): the -f help text writes the wildcard as '%%' while the
  -u help text writes '%'; one of them is wrong depending on whether
  the help text is printed as a format string - confirm against
  UdmCmdLineOptionsPrint().
*/
578 static UDM_CMDLINE_OPT udm_indexer_options[]=
579 {
580 {-1, "", UDM_OPT_TITLE,NULL, "\nCrawler options:"},
581 {'a', "", UDM_OPT_BOOL, NULL, "Revisit all documents even if not expired (can be\n"
582 "limited using -t, -u, -s, -c, -y and -f options)"},
583 {'m', "", UDM_OPT_BOOL, NULL, "Update expired documents even if not modified (can be\n"
584 "limited using -t, -u, -c, -s, -y and -f options)"},
585 {'e', "", UDM_OPT_BOOL, NULL, "Visit 'most expired' (oldest) documents first"},
586 {'o', "", UDM_OPT_BOOL, NULL, "Visit documents with less depth (hops value) first"},
587 {'r', "", UDM_OPT_BOOL, NULL, "Do not try to reduce remote servers load by randomising\n"
588 "crawler queue order (faster, but less polite)"},
589 {'n', "", UDM_OPT_INT, NULL, "Visit only # documents and exit"},
590 {'c', "", UDM_OPT_INT, NULL, "Visit only # seconds and exit"},
591 {'q', "", UDM_OPT_BOOL, NULL, "Quick startup (do not add Server URLs); -qq even quicker"},
592 {'b', "", UDM_OPT_BOOL, NULL, "Block starting more than one indexer instances"},
593 {'i', "", UDM_OPT_BOOL, NULL, "Insert new URLs (URLs to insert must be given using -u or -f)"},
594 {'p', "", UDM_OPT_INT, NULL, "Sleep # seconds after downloading every URL"},
595 {'w', "", UDM_OPT_BOOL, NULL, "Do not ask for confirmation when clearing documents\n"
596 "from the database (e.g.: indexer -Cw)"},
597 {'N', "", UDM_OPT_INT, NULL, "Run # threads (for crawling or indexing)"},
598
599
600 {-1, "", UDM_OPT_TITLE,NULL, "\nSubsection control options (can be combined):"},
601 {'s', "", UDM_OPT_STR, NULL, "Limit indexer to documents matching status (HTTP Status code)"},
602 {'t', "", UDM_OPT_STR, NULL, "Limit indexer to documents matching tag"},
603 {'y', "", UDM_OPT_STR, NULL, "Limit indexer to documents matching content-type"},
604 {'L', "", UDM_OPT_STR, NULL, "Limit indexer to documents matching language"},
605 {'u', "", UDM_OPT_STR, NULL, "Limit indexer to documents with URLs matching pattern\n"
606 "(supports SQL LIKE wildcards '%' and '_')"},
607 {0, "seed",UDM_OPT_STR, NULL, "Limit indexer to documents with the given seed (0-255)"},
608 {'D', "", UDM_OPT_STR, NULL, "Work with the n-th database only (i.e. with the n-th DBAddr)"},
609 {'f', "", UDM_OPT_STR, NULL, "Read URLs to be visited/inserted/deleted from file (with -a\n"
610 "or -C option, supports SQL LIKE wildcard '%%'; has no effect\n"
611 "when combined with -m option)"},
612 {-1, "", UDM_OPT_TITLE,NULL,
613 " -f - Use stdin instead of a file as an URL list"},
614
615
616 {-1, "", UDM_OPT_TITLE,NULL, "\nLogging options:"},
617 {'l', "", UDM_OPT_BOOL, NULL, "Do not log to stdout/stderr"},
618 {'v', "", UDM_OPT_INT, NULL, "Verbose level (0-5)"},
619
620
621 {-1, "", UDM_OPT_TITLE,NULL, "\nMisc. options:"},
622 {'F', "", UDM_OPT_STR, NULL, "Print compile configuration and exit (e.g.: indexer -F '*')"},
623 {'h',"help",UDM_OPT_BOOL, NULL,"Print help page and exit; -hh print more help"},
624 {'?', "", UDM_OPT_BOOL, NULL, "Print help page and exit; -?? print more help"},
625 {'d', "", UDM_OPT_STR, NULL, "Use the given configuration file instead of indexer.conf"
626 #ifndef WIN32
627 "\nThis option is usefull when running indexer as an\n"
628 "interpreter, e.g.: #!/usr/local/sbin/indexer -d"
629 #endif
630 },
631 {'j', "", UDM_OPT_STR, NULL, "Set current time for statistic (use with -S),\n"
632 "format: YYYY-MM[-DD[ HH[:MM[:SS]]]]\n"
633 "or time offset, e.g. 1d12h (see Period in indexer.conf)"},
634 {UDM_IND_SET, "set", UDM_OPT_STR, NULL, "Set variable"},
635 {-1, "", UDM_OPT_TITLE,NULL, "\nCommands (can be used with subsection control options):"},
636 {UDM_IND_CRAWL, "crawl", UDM_OPT_BOOL,NULL, "Crawl (default command)"},
637 {UDM_IND_MULTI2BLOB, "index", UDM_OPT_BOOL,NULL, "Create search index"},
638 {UDM_IND_WRDSTAT, "wordstat", UDM_OPT_BOOL,NULL, "Create statistics for misspelled word suggestions"},
639 {UDM_IND_REWRITEURL, "rewriteurl", UDM_OPT_BOOL,NULL, "Rewrite URL data into the current search index"},
640 {UDM_IND_REWRITELIMITS,"rewritelimits",UDM_OPT_BOOL,NULL, "Recreate all Limit, UserScore, UserOrder data"},
641 {UDM_IND_REWRITEPOP, "rewritepop", UDM_OPT_BOOL,NULL, "Recreate popularity data"},
642 {UDM_IND_DELETE, /*C*/ "delete", UDM_OPT_BOOL,NULL, "Delete documents from the database"},
643 {UDM_IND_STAT, /*S*/ "statistics", UDM_OPT_BOOL,NULL, "Print statistics and exit"},
644 {UDM_IND_REFERERS,/*I*/ "referers", UDM_OPT_BOOL,NULL, "Print referers and exit "},
645
646 {-1, "", UDM_OPT_TITLE,NULL, "\nOther commands:"},
647 #ifdef HAVE_SQL /* TODO34 */
648 {UDM_IND_CREATE, "create", UDM_OPT_BOOL,NULL, "Create SQL table structure and exit"},
649 {UDM_IND_DROP, "drop", UDM_OPT_BOOL,NULL, "Drop SQL table structure and exit"},
650 {UDM_IND_SQLMON, /*Q*/ "sqlmon", UDM_OPT_BOOL,NULL, "Run interactive SQL monitor"},
651 {UDM_IND_EXECSQL, "exec", UDM_OPT_STR, NULL, "Execute SQL query"},
652 #endif
653 {UDM_IND_CHECKCONF, "checkconf", UDM_OPT_BOOL,NULL, "Check configuration file for good syntax"},
654 {UDM_IND_EXPORT, "export", UDM_OPT_BOOL,NULL, NULL}, /* TODO */
655 {UDM_IND_HASHSPELL, "hashspell", UDM_OPT_BOOL,NULL, "Create hash files for the active Ispell dictionaries"},
656 {UDM_IND_DUMPSPELL, "dumpspell", UDM_OPT_BOOL,NULL, "Dump Ispell data for use with SQLWordForms"},
657 {UDM_IND_DUMPCONF, "dumpconf", UDM_OPT_BOOL,NULL, NULL},
658 {UDM_IND_DUMPDATA, "dumpdata", UDM_OPT_BOOL,NULL, "Dump collected data using SQL statements"},
659 {UDM_IND_RESTOREDATA, "restoredata", UDM_OPT_BOOL,NULL, "Load prevously dumped data (give a filename using -f)"},
660
661 {UDM_IND_SET0, "fl", UDM_OPT_STR, NULL, NULL},
662 {0,NULL,0,NULL,NULL}
663 };
664
665
666 static int
UdmParseCmdLine(UDM_INDEXER_OPTIONS * io,int argc,char ** argv,size_t * noptions)667 UdmParseCmdLine(UDM_INDEXER_OPTIONS *io,
668 int argc, char **argv, size_t *noptions)
669 {
670 return UdmCmdLineOptionsGet(io, argc, argv, udm_indexer_options,
671 UdmCmdLineHandleOption, noptions);
672 }
673
674
675 /*
676 static int
677 UdmReloadEnv(UDM_AGENT *Indexer)
678 {
679 UDM_ENV NewConf;
680 int rc;
681
682 UdmLog(Indexer,UDM_LOG_ERROR,"Reloading config '%s'",cname);
683 UdmEnvInit(&NewConf);
684 UdmSetLockProc(&NewConf,UdmLockProc);
685 UdmSetRefProc(&NewConf,UdmRefProc);
686
687 Indexer->Conf = &NewConf;
688 rc = UdmIndexerEnvLoad(Indexer, cname, add_servers + load_langmaps + UDM_FLAG_SPELL);
689 Indexer->Conf = &Conf;
690
691 if (rc!=UDM_OK)
692 {
693 UdmLog(Indexer,UDM_LOG_ERROR,"Can't load config: %s",UdmEnvErrMsg(&NewConf));
694 UdmLog(Indexer,UDM_LOG_ERROR,"Continuing with old config");
695 UdmEnvFree(&NewConf);
696 }
697 else
698 {
699 size_t noptions;
700 UdmEnvFree(&Conf);
701 Conf=NewConf;
702 UdmParseCmdLine(UdmARGC, UdmARGV, &noptions);
703 #ifndef WIN32
704 UdmOpenLog("indexer", &Conf, log2stderr);
705 #endif
706 }
707 return UDM_OK;
708 }
709 */
710
711
712 static size_t
create_shared_info(UDM_AGENT * A,char * str,size_t len)713 create_shared_info(UDM_AGENT *A, char *str, size_t len)
714 {
715 UDM_ENV *Env= A->Conf;
716 size_t res= udm_snprintf(str, len,
717 "Hrefs: %d,"
718 "Targets: %d,"
719 "Cookies: %d,"
720 "Robots: %d,"
721 "Hosts: %d,"
722 "IPs: %d",
723 (int) Env->Hrefs.nhrefs,
724 (int) Env->Targets.num_rows,
725 (int) Env->Cookies.nvars,
726 (int) Env->Robots.nrobots,
727 (int) Env->Hosts.nhost_addr,
728 (int) Env->InAddr.nitems);
729 return res;
730 }
731
732
733 static udm_rc_t
httpd_client_handler(int client,UDM_AGENT * A)734 httpd_client_handler(int client, UDM_AGENT *A)
735 {
736 char request[4096];
737 char response[1024];
738 char speed_info[128]= "";
739 char shared_info[128]= "";
740 ssize_t nrecv;
741 size_t i, len, total_docs= 0, total_sec= 0;
742 udm_uint8 total_bytes= 0;
743 time_t now= time(0);
744
745 nrecv= recv(client, request, sizeof(request), 0);
746 UdmLog(A, UDM_LOG_ERROR, "Received request len=%d", (int) nrecv);
747 udm_snprintf(response, sizeof(response) - 1,
748 "HTTP/1.0 200 OK\r\nContent-Type: text/html\r\n\r\n");
749 UdmSend(client, response, strlen(response), 0);
750 len= sprintf(response,
751 "Threads:"
752 "<table border=1 cellspacing=1 cellpadding=1>\n"
753 "<tr><th>ID</th>"
754 "<th>Docs</th>"
755 "<th>Size</th>"
756 "<th>Task</th>"
757 "<th>Time</th>"
758 "<th>Param</th>"
759 "<th>Extra</th></tr>");
760 UdmSend(client, response, len, 0);
761 for (i= 0; i < (size_t) maxthreads; i++)
762 {
763 UDM_AGENT *Tmp= &ThreadCrawlers[i].Agent;
764 char mutex_owned_info[64]= "", *mi;
765 size_t sec;
766 udm_mutexno_t mutex;
767
768 for (mutex= 0, mi= mutex_owned_info;
769 mutex < UDM_LOCK_MAX;
770 mutex++)
771 {
772 mi+= UdmMutexStatePrint(mi, sizeof(mutex_owned_info) - (mi - mutex_owned_info),
773 Tmp, mutex);
774 }
775
776 len= sprintf(response,
777 "<tr><td>%d</td>"
778 "<td align=right>%d</td>"
779 "<td align=right>%llu</td>"
780 "<td>%s</td>"
781 "<td align=right>%d</td>"
782 "<td>%s </td>"
783 "<td>%s%s </td></tr>\n",
784 Tmp->handle,
785 (int) Tmp->ndocs,
786 (unsigned long long) Tmp->nbytes,
787 Tmp->State.task,
788 (int) (now - Tmp->State.start_time),
789 UDM_NULL2EMPTY(Tmp->State.param),
790 UDM_NULL2EMPTY(Tmp->State.extra),
791 mutex_owned_info);
792 UdmSend(client, response, len, 0);
793 total_docs+= Tmp->ndocs;
794 total_bytes+= Tmp->nbytes;
795 sec= (size_t) (now - Tmp->start_time);
796 if (sec > total_sec)
797 total_sec= sec;
798 }
799 if (total_sec)
800 {
801 udm_snprintf(speed_info, sizeof(speed_info) - 1,
802 "%d seconds, %d docs/sec, %d bytes/sec",
803 (int) total_sec,
804 (int) (total_docs / total_sec),
805 (int) (total_bytes / total_sec));
806 }
807
808 len= sprintf(response,
809 "<tr><td> </td>"
810 "<td align=right>%d</td>"
811 "<td align=right>%llu</td>"
812 "<td> </td>"
813 "<td align=right> </td>"
814 "<td>%s </td>"
815 "<td> </td></tr>\n",
816 (int) total_docs,
817 (unsigned long long) total_bytes,
818 speed_info);
819 UdmSend(client, response, len, 0);
820 len= sprintf(response, "</table>\n");
821 UdmSend(client, response, len, 0);
822 len= create_shared_info(A, shared_info, sizeof(shared_info));
823 UdmSend(client, shared_info, len, 0);
824 return UDM_OK;
825 }
826
827
828 #ifdef WIN32
thread_main_httpd(void * arg)829 unsigned int __stdcall thread_main_httpd(void *arg)
830 #else
831 static void* thread_main_httpd(void *arg)
832 #endif
833 {
834 UDM_AGENT *A= (UDM_AGENT*) arg;
835 #ifndef WIN32
836 UdmStartHTTPD(A, httpd_client_handler);
837 #endif
838 return 0;
839 }
840
841
842
/* Path of the file removed by exitproc() */
843 static char pidname[1024];
/* Buffer filled and returned by time_pid_info() */
844 static char time_pid[100];
845
exitproc(void)846 static void exitproc(void)
847 {
848 unlink(pidname);
849 }
850
851
time_pid_info(void)852 static char * time_pid_info(void)
853 {
854 struct tm * tim;
855 time_t t;
856 t= time(NULL);
857 tim= localtime(&t);
858 strftime(time_pid,sizeof(time_pid),"%a %d %H:%M:%S",tim);
859 sprintf(time_pid+strlen(time_pid)," [%d]",(int)getpid());
860 return(time_pid);
861 }
862
863
/*
  Initialize Winsock (version 1.1) on WIN32 builds; does nothing on
  other platforms. Prints the error code to stderr and terminates the
  process with exit status 1 when initialization fails.
*/
static void UdmWSAStartup(void)
{
#ifdef WIN32
  WSADATA wsaData;
  /*
    WSAStartup() returns the error code directly. The previous code
    passed the address of WSAGetLastError instead of an error value.
  */
  int err= WSAStartup(0x101, &wsaData);
  if (err != 0)
  {
    fprintf(stderr, "WSAStartup() error %d\n", err);
    exit(1);
  }
#endif
}
875
/*
  Counterpart of UdmWSAStartup(): calls WSACleanup() on WIN32 builds
  and does nothing on other platforms.
*/
static void UdmWSACleanup(void)
{
#ifdef WIN32
  WSACleanup();
#endif
}
883
/*
  Print msg as a prompt on stdout and read an answer from stdin.

  Returns 1 when the answer starts with "YES" (case-sensitive) and
  0 otherwise, including on end of input. At most 4 characters of the
  answer are consumed from stdin.
*/
static int UdmConfirm(const char *msg)
{
  char str[5];
  printf("%s", msg);
  /*
    The prompt has no trailing newline; flush it explicitly so that it
    is visible before we block on stdin, even when stdout is buffered.
  */
  fflush(stdout);
  return (fgets(str, sizeof(str), stdin) && !strncmp(str, "YES", 3));
}
890
891
892 static udm_rc_t
UdmIndDelete(UDM_AGENT * A,UDM_QUERY * Query,const UDM_INDEXER_OPTIONS * io)893 UdmIndDelete(UDM_AGENT *A, UDM_QUERY *Query, const UDM_INDEXER_OPTIONS *io)
894 {
895 int clear_confirmed=1;
896 udm_rc_t rc= UDM_OK;
897 if (io->warnings)
898 {
899 size_t i;
900 printf("You are going to delete content from the database(s):\n");
901 for (i = 0; i < Conf.DBList.nitems; i++)
902 {
903 UDM_DB *db= &Conf.DBList.Item[i];
904 char dbaddr[128];
905 size_t nbytes;
906 db->dbhandler->Info(db, dbaddr, sizeof(dbaddr), &nbytes, UDM_DBINFO_ADDR);
907 printf("%s\n", dbaddr);
908 }
909 clear_confirmed=UdmConfirm("Are you sure?(YES/no)");
910 }
911
912 if (clear_confirmed)
913 {
914 if (io->url_filename)
915 {
916 rc= UdmURLFile(A, io->url_filename, UDM_URL_FILE_CLEAR);
917 }
918 else
919 {
920 rc= UdmQueryAction(A, Query, UDM_QUERYCMD_CLEAR);
921 }
922 }
923 else
924 {
925 printf("Canceled\n");
926 }
927 if (rc != UDM_OK)
928 {
929 fflush(stdout);
930 UdmLog(A, UDM_LOG_ERROR, "Error: '%s'", UdmEnvErrMsg(A->Conf));
931 }
932 return rc;
933 }
934
935 static void
UdmPrintCumulativeStatistics(UDM_AGENT * Indexer,UDM_CRAWLER * Item,size_t nitems,time_t sec)936 UdmPrintCumulativeStatistics(UDM_AGENT *Indexer,
937 UDM_CRAWLER *Item, size_t nitems, time_t sec)
938 {
939 size_t i;
940 unsigned int ndocs= 0;
941 unsigned long long nbytes= 0;
942 double M= 0.0, K= 0.0;
943
944 for (i= 0; i < nitems; i++)
945 {
946 ndocs+= Item[i].Agent.ndocs;
947 nbytes+= Item[i].Agent.nbytes;
948 }
949 if (sec > 0)
950 {
951 /* Convert to int64 - conversion from uint64 to double doesn't work on windows */
952 M= ((udm_timer_t) nbytes) / 1048576.0 / sec;
953 if (M < 1.0) K= ((udm_timer_t) nbytes) / 1024.0 / sec;
954 }
955 UdmLog(Indexer, UDM_LOG_ERROR,
956 "Done (%d seconds, %u documents, %llu bytes, %5.2f %cbytes/sec.)",
957 (int) sec, (unsigned int) ndocs, (unsigned long long) nbytes,
958 (M < 1.0) ? K : M, (M < 1.0) ? 'K' : 'M' );
959 }
960
961
/*
  Run the crawl command.

  Reads the "CrawlerThreads" variable (default 1) into the global
  maxthreads and allocates a zeroed ThreadCrawlers array of that size.
  If a "Listen" variable exists and (in pthread builds) is not empty,
  a status HTTP server thread is started with its own agent;
  otherwise "Not starting HTTPD" is logged.

  In pthread builds one agent per crawler is initialized (handles
  start at 1) and one UdmCrawlerMain thread is created per crawler,
  optionally sleeping "seconds" before each start. The function then
  waits for the crawlers: pthread_join() on non-WIN32, polling
  total_threads once per second on WIN32.
  In non-pthread builds UdmCrawlerMain() is called directly.

  Cumulative statistics are logged when more than one thread was used.
  Returns UDM_ERROR when thd_errors is non-zero, UDM_OK otherwise.

  NOTE(review): the results of both UdmMalloc() calls are used
  without a NULL check, and a non-positive CrawlerThreads value is
  not rejected - confirm whether this can happen.
  NOTE(review): the HTTPD thread is never joined, while httpd_agent
  lives on this function's stack and is freed before returning -
  confirm the thread cannot outlive this call.
  NOTE(review): the threaded path passes a UDM_CRAWLER* to
  UdmCrawlerMain while the non-threaded path passes the UDM_AGENT* A -
  confirm which argument type UdmCrawlerMain expects.
*/
962 static udm_rc_t
UdmCrawl(UDM_AGENT * A,UDM_QUERY * Query)963 UdmCrawl(UDM_AGENT *A, UDM_QUERY *Query)
964 {
965 UDM_AGENT httpd_agent;
966 size_t nbytes;
967 const UDM_VAR *Listen;
968 time_t start_time= time(0);
969
970 maxthreads= UdmVarListFindInt(&A->Conf->Vars, "CrawlerThreads", 1);
971 nbytes= maxthreads * sizeof(UDM_CRAWLER);
972 ThreadCrawlers= (UDM_CRAWLER*) UdmMalloc(nbytes);
973 bzero((void*) ThreadCrawlers, nbytes);
974 UdmAgentInit(&httpd_agent, A->Conf, 0);
975
976 if ((Listen= UdmVarListFind(&A->Conf->Vars, "Listen")))
977 {
978 #ifdef HAVE_PTHREAD
979 if (UdmVarStr(Listen) && UdmVarStr(Listen)[0])
980 {
981 udm_thread_t httpd_thread;
982 UdmThreadCreate(&httpd_thread, thread_main_httpd, &httpd_agent);
983 }
984 else
985 #endif
986 {
987 UdmLog(A, UDM_LOG_ERROR, "Not starting HTTPD");
988 }
989 }
990
991
992 #ifdef HAVE_PTHREAD
/* Prepare all crawler agents before any crawler thread is started */
993 {
994 int i;
995 for (i= 0; i < maxthreads; i++)
996 {
997 UdmAgentInit(&ThreadCrawlers[i].Agent, A->Conf, i + 1);
998 ThreadCrawlers[i].Query= Query;
999 ThreadCrawlers[i].Agent.flags= flags;
1000 }
1001 }
1002
1003 {
1004 int i;
1005 nbytes= maxthreads * sizeof(udm_thread_t);
1006 threads= (udm_thread_t*) UdmMalloc(nbytes);
1007 bzero((void*) threads, nbytes);
1008 for(i= 0; i < maxthreads; i++)
1009 {
1010 if (seconds) UDMSLEEP(seconds);
1011
1012 UdmThreadCreate(&threads[i], &UdmCrawlerMain, &ThreadCrawlers[i]);
1013
/* total_threads is updated under UDM_LOCK_THREAD */
1014 UDM_GETLOCK(A, UDM_LOCK_THREAD);
1015 total_threads= i + 1;
1016 UDM_RELEASELOCK(A,UDM_LOCK_THREAD);
1017 }
1018 #ifndef WIN32
1019 for (i = 0; i < maxthreads; i++)
1020 pthread_join(threads[i], NULL);
1021 #else
/* No join on WIN32: poll until total_threads drops to 0 */
1022 while(1)
1023 {
1024 int num;
1025 UDM_GETLOCK(A,UDM_LOCK_THREAD);
1026 num=total_threads;
1027 UDM_RELEASELOCK(A,UDM_LOCK_THREAD);
1028 if (!num)break;
1029 UDMSLEEP(1);
1030 }
1031 #endif
1032 UDM_FREE(threads);
1033 }
1034 #else
1035 A->handle = 1;
1036 UdmCrawlerMain(A);
1037 #endif
1038
1039 if (maxthreads > 1)
1040 UdmPrintCumulativeStatistics(A, ThreadCrawlers, maxthreads,
1041 time(0) - start_time);
1042 UdmAgentFree(&httpd_agent);
1043 UDM_FREE(ThreadCrawlers);
1044 return thd_errors ? UDM_ERROR : UDM_OK;
1045 }
1046
1047
1048 static int
UdmDumpDocument(UDM_AGENT * A,UDM_DOCUMENT * D)1049 UdmDumpDocument(UDM_AGENT *A, UDM_DOCUMENT *D)
1050 {
1051 size_t i;
1052 printf(" <item>\n");
1053 /*
1054 printf(" <link>%s</link>\n", UdmVarListFindStr(&D->Sections, "URL", ""));
1055 printf(" <id>%s</id>\n", UdmVarListFindStr(&D->Sections, "ID", ""));
1056 printf(" <content-length>%s</content-length>\n", UdmVarListFindStr(&D->Sections, "Content-Length", ""));
1057 printf(" <status>%s</status>\n", UdmVarListFindStr(&D->Sections, "Status", ""));
1058 printf(" <hops>%s</hops>\n", UdmVarListFindStr(&D->Sections, "Hops", ""));
1059 printf(" <crc32>%s</crc32>\n", UdmVarListFindStr(&D->Sections, "crc32", ""));
1060 printf(" <modified>%s</modified>\n", UdmVarListFindStr(&D->Sections, "Last-Modified", ""));
1061 */
1062 for (i= 0; i < D->Sections.nvars; i++)
1063 {
1064 const UDM_VAR *S= UdmVarListFindConstByIndex(&D->Sections, i);
1065 UDM_CONST_STR strbuf, *str= UdmVarGetConstStr(S, &strbuf);
1066 printf(" <%s>%.*s</%s>\n",
1067 UdmVarName(S), (int) str->length, str->str, UdmVarName(S));
1068 }
1069 printf(" </item>\n");
1070 return 0;
1071 }
1072
1073
1074 static udm_rc_t
UdmDumpDocuments(UDM_AGENT * A,UDM_QUERY * Query)1075 UdmDumpDocuments(UDM_AGENT *A, UDM_QUERY *Query)
1076 {
1077 udm_rc_t rc;
1078 A->Conf->DumpDoc= UdmDumpDocument;
1079
1080 /* printf("<rss><channel>\n"); */
1081 if (UDM_OK != (rc= UdmQueryAction(A, Query, UDM_QUERYCMD_DUMPDATA)))
1082 return rc;
1083 /* printf("</channel></rss>\n"); */
1084 return UDM_OK;
1085 }
1086
1087
1088 #include <sys/stat.h>
1089 /*
1090 TODO for dump/restore:
1091 - Escape bad characters to entities on dump, unescape on restore.
1092 - Huge file size
1093 - Subsection control
1094 - Database number limit
1095 - Insert into database
1096 */
1097 static udm_rc_t
UdmRestoreDocuments(UDM_AGENT * A,const UDM_INDEXER_OPTIONS * io)1098 UdmRestoreDocuments(UDM_AGENT *A, const UDM_INDEXER_OPTIONS *io)
1099 {
1100 udm_rc_t rc= UDM_OK;
1101 UDM_DSTR dstr;
1102 struct stat sb;
1103 int fd;
1104 UDM_CHARSET *cs;
1105
1106 if (!io->url_filename)
1107 {
1108 fprintf(stderr, "Required option -f ommitted\n");
1109 return UDM_ERROR;
1110 }
1111 fprintf(stderr, "Restoring data from '%s'\n", io->url_filename);
1112
1113 if (stat(io->url_filename, &sb))
1114 {
1115 fprintf(stderr, "Can't stat '%s'\n", io->url_filename);
1116 return UDM_ERROR;
1117 }
1118 UdmDSTRInit(&dstr, 1024);
1119 if ((fd= open(io->url_filename, O_RDONLY|UDM_BINARY)) <= 0)
1120 {
1121 rc= UDM_ERROR;
1122 fprintf(stderr, "Can't open '%s'", io->url_filename);
1123 goto ex;
1124 }
1125
1126 if (UDM_OK != (rc= UdmDSTRReadFile(&dstr, fd, sb.st_size)))
1127 {
1128 rc= UDM_ERROR;
1129 #ifndef WIN32
1130 fprintf(stderr, "Failed to read %lld bytes from '%s'\n",
1131 (long long) sb.st_size, io->url_filename);
1132 #endif
1133 goto ex;
1134 }
1135
1136 cs= &udm_charset_latin1;
1137 rc= UdmQueryFromXML(A, NULL, UdmDSTRPtr(&dstr), UdmDSTRLength(&dstr), cs);
1138 ex:
1139 UdmDSTRFree(&dstr);
1140 return rc;
1141 }
1142
1143
1144 static udm_rc_t
UdmIndHashSpell(UDM_AGENT * Agent)1145 UdmIndHashSpell(UDM_AGENT *Agent)
1146 {
1147 UDM_ENV *Env= Agent->Conf;
1148 char errmsg[256];
1149 udm_rc_t rc;
1150 if (UDM_OK != (rc= UdmSpellListListLoad(&Env->Spells, errmsg, sizeof(errmsg)))||
1151 UDM_OK != (rc= UdmSpellListListWriteHash(&Env->Spells, errmsg, sizeof(errmsg))))
1152 {
1153 fprintf(stderr, "error: %s\n", errmsg);
1154 }
1155 return rc;
1156 }
1157
1158
1159 static udm_rc_t
UdmIndDumpSpell(UDM_AGENT * Agent)1160 UdmIndDumpSpell(UDM_AGENT *Agent)
1161 {
1162 char errmsg[256];
1163 int spflags= UDM_SPELL_NOPREFIX;
1164 UDM_ENV *Env= Agent->Conf;
1165 udm_rc_t rc;
1166 if (UDM_OK != (rc= UdmSpellListListLoad(&Env->Spells,
1167 errmsg, sizeof(errmsg)))||
1168 UDM_OK != (rc= UdmAffixListListLoad(&Env->Affixes, spflags,
1169 errmsg, sizeof(errmsg)))||
1170 UDM_OK != (rc= UdmSpellDump(&Env->Spells, &Env->Affixes,
1171 errmsg, sizeof(errmsg))))
1172 {
1173 fprintf(stderr, "error: %s\n", errmsg);
1174 }
1175 return rc;
1176 }
1177
1178
1179 static udm_rc_t
UdmIndSQLMonitor(UDM_AGENT * Agent,UDM_INDEXER_OPTIONS * io)1180 UdmIndSQLMonitor(UDM_AGENT *Agent, UDM_INDEXER_OPTIONS *io)
1181 {
1182 udm_rc_t rc;
1183 UDM_SQLMON_PARAM prm;
1184 bzero((void*)&prm,sizeof(prm));
1185 prm.flags= loglevel > 0 ? UDM_SQLMON_DISPLAY_FIELDS : 0;
1186 execsql= UdmVarListFindStr(&Conf.Vars, "exec", NULL);
1187 prm.iohandler.gets= execsql ? sqlexecgets : sqlmongets;
1188 prm.iohandler.prompt= sqlmonprompt;
1189 prm.iohandler.user_data= &prm;
1190 #ifdef HAVE_READLINE
1191 if (isatty(0))
1192 prm.mode= udm_sqlmon_mode_interactive;
1193 #endif
1194 rc= UdmSQLMonitor(Agent, &Conf, &prm);
1195 if (prm.iohandler.gets == sqlmongets)
1196 {
1197 /* Display "\n" after the last ">SQL" prompt */
1198 sqlmonprompt(&prm.iohandler, UDM_MSG_EOL, "");
1199 }
1200 return rc;
1201 }
1202
1203
1204 static udm_rc_t
UdmIndBlock(void)1205 UdmIndBlock(void)
1206 {
1207 int pid_fd;
1208 char pidbuf[128];
1209 /* Check that another instance isn't running and create PID file. */
1210 const char *vardir= UdmVarListFindStr(&Conf.Vars,"VarDir",UDM_VAR_DIR);
1211 sprintf(pidname,"%s/%s", vardir ,"indexer.pid");
1212 pid_fd = open(pidname,O_CREAT|O_EXCL|O_WRONLY,0644);
1213 if (pid_fd < 0)
1214 {
1215 fprintf(stderr,"%s Can't create '%s': %s\n", time_pid_info(), pidname, strerror(errno));
1216 if (errno == EEXIST)
1217 {
1218 fprintf(stderr,"It seems that another indexer is already running!\n");
1219 fprintf(stderr,"Remove '%s' if it is not true.\n",pidname);
1220 }
1221 return UDM_ERROR;
1222 }
1223 udm_snprintf(pidbuf, sizeof(pidbuf), "%d\n", (int)getpid());
1224 write(pid_fd, &pidbuf, strlen(pidbuf));
1225 #ifdef HAVE_ATEXIT
1226 atexit(&exitproc);
1227 #endif
1228 return UDM_OK;
1229 }
1230
1231
1232 static udm_rc_t
UdmIndInsertFromFile(UDM_AGENT * Agent,UDM_INDEXER_OPTIONS * io)1233 UdmIndInsertFromFile(UDM_AGENT *Agent, UDM_INDEXER_OPTIONS *io)
1234 {
1235 udm_rc_t rc= UDM_OK;
1236 if (strcmp(io->url_filename,"-"))
1237 {
1238 /* Make sure all URLs to be inserted are OK */
1239 if (UDM_OK != (rc= UdmURLFile(Agent, io->url_filename, UDM_URL_FILE_PARSE)))
1240 {
1241 UdmLog(Agent, UDM_LOG_ERROR,"Error: Invalid URL in '%s'", io->url_filename);
1242 goto ex;
1243 }
1244 }
1245 if (UDM_OK != (rc= UdmURLFile(Agent, io->url_filename, UDM_URL_FILE_INSERT)))
1246 {
1247 UdmLog(Agent, UDM_LOG_ERROR,"Error: '%s'", UdmEnvErrMsg(Agent->Conf));
1248 goto ex;
1249 }
1250 ex:
1251 return rc;
1252 }
1253
1254
1255 static udm_rc_t
UdmIndExpire(UDM_AGENT * Agent,UDM_QUERY * Query,const UDM_INDEXER_OPTIONS * io)1256 UdmIndExpire(UDM_AGENT *Agent, UDM_QUERY *Query, const UDM_INDEXER_OPTIONS *io)
1257 {
1258 udm_rc_t rc= io->url_filename ?
1259 UdmURLFile(Agent, io->url_filename, UDM_URL_FILE_REINDEX) :
1260 UdmQueryAction(Agent, Query, UDM_QUERYCMD_EXPIRE);
1261 return rc;
1262 }
1263
1264
1265 static udm_rc_t
UdmIndCrawl(UDM_AGENT * Agent,UDM_QUERY * Query,const UDM_INDEXER_OPTIONS * io)1266 UdmIndCrawl(UDM_AGENT *Agent, UDM_QUERY *Query, const UDM_INDEXER_OPTIONS *io)
1267 {
1268 udm_rc_t rc;
1269 if (io->block && UDM_OK != (rc= UdmIndBlock()))
1270 goto ex;
1271 UdmLog(Agent, UDM_LOG_WARN, "indexer from %s-%s-%s started with '%s'", PACKAGE, VERSION, UDM_DBTYPE, cname);
1272 UdmSigHandlersInit(Agent);
1273
1274 if (io->expire)
1275 {
1276 if (UDM_OK != (rc= UdmIndExpire(Agent, Query, io)))
1277 goto ex;
1278 }
1279
1280 if (UDM_OK != (rc= UdmCrawl(Agent, Query)))
1281 goto ex;
1282
1283 ex:
1284 return rc;
1285 }
1286
1287
main(int argc,char ** argv)1288 int main(int argc, char **argv)
1289 {
1290 UDM_INDEXER_OPTIONS io;
1291 UDM_AGENT Main;
1292 UDM_QUERY Query;
1293 udm_rc_t rc= UDM_OK;
1294 char *REQUEST_METHOD= getenv("REQUEST_METHOD");
1295 FILE *logfile= REQUEST_METHOD ? stdout : stderr;
1296 size_t noptions;
1297 #ifdef CHASEN
1298 char *chasen_argv[] = { "chasen", "-b", "-f", "-F", "%m ", NULL };
1299 chasen_getopt_argv(chasen_argv, NULL);
1300 #endif
1301
1302 UdmIndexerOptionsInit(&io);
1303
1304 if (REQUEST_METHOD)
1305 printf("Content-Type: text/plain\r\n\r\n");
1306
1307 UdmWSAStartup();
1308
1309 UdmInit(); /* Initialize library */
1310
1311 UdmInitMutexes();
1312 UdmEnvInit(&Conf);
1313 UdmVarListAddEnviron(&Conf.Vars,"ENV");
1314 UdmSetLockProc(&Conf,UdmLockProc);
1315
1316 #if defined(HAVE_PTHREAD) && !defined(WIN32)
1317 Conf.THDHandler.ThreadCreate= UdmThreadCreate;
1318 Conf.THDHandler.ThreadJoin= UdmThreadJoin;
1319 #endif
1320
1321 UdmSetRefProc(&Conf,UdmRefProc);
1322 #ifdef THINFO_TEST
1323 UdmSetThreadProc(&Conf,UdmShowThreadInfoProc);
1324 #endif
1325 UdmAgentInit(&Main,&Conf,0);
1326 UdmQueryInit(&Query);
1327
1328 UdmARGC= argc;
1329 UdmARGV= argv;
1330
1331 if (UdmParseCmdLine(&io, UdmARGC, UdmARGV, &noptions))
1332 goto ex;
1333
1334 if (io.cmd == UDM_IND_INDEX)
1335 {
1336 fprintf(stderr, "\nWARNING: \"indexer -Eindex\" is deprecated. "
1337 "Use \"indexer -Ecrawl\" instead!\n\n");
1338 io.cmd= UDM_IND_CRAWL;
1339 }
1340
1341 if (io.cmd == UDM_IND_AMBIGUOUS)
1342 {
1343 fprintf(stderr, "Ambiguous indexer command in -E\n");
1344 io.help++;
1345 }
1346
1347 if (io.cmd == UDM_IND_UNKNOWN)
1348 {
1349 fprintf(stderr, "Unknown indexer command in -E\n");
1350 io.help++;
1351 }
1352
1353 if (io.cmd == UDM_IND_DUMPCONF)
1354 {
1355 io.load_for_dump|= UDM_FLAG_DONT_ADD_TO_DB;
1356 io.load_langmaps= 0;
1357 io.load_spells= 0;
1358 }
1359 else if (io.cmd != UDM_IND_CRAWL)
1360 {
1361 if (io.cmd != UDM_IND_MULTI2BLOB && /* TODO34: rename this to UDM_IND_INDEX */
1362 io.cmd != UDM_IND_REWRITEPOP)
1363 io.add_servers= 0;
1364 else
1365 {
1366 io.add_servers|= UDM_FLAG_DONT_ADD_TO_DB;
1367 io.add_server_urls= 0;
1368 }
1369 io.load_langmaps= 0;
1370 if (io.cmd != UDM_IND_HASHSPELL && io.cmd != UDM_IND_DUMPSPELL)
1371 io.load_spells= 0;
1372 }
1373
1374 flags|= io.add_servers;
1375 flags |= io.add_server_urls;
1376 Main.flags= flags;
1377
1378 argc-= noptions;
1379 argv+= noptions;
1380
1381 if (argc > 1 || io.help)
1382 {
1383 usage(io.help, udm_indexer_options);
1384 rc= UDM_ERROR;
1385 goto ex;
1386 }
1387
1388 if (!*cname)
1389 {
1390 if (argc == 1)
1391 {
1392 strncpy(cname,argv[0],sizeof(cname));
1393 cname[sizeof(cname)-1]='\0';
1394 }
1395 else
1396 {
1397 const char *cd=UdmVarListFindStr(&Conf.Vars,"ConfDir",UDM_CONF_DIR);
1398 udm_snprintf(cname,sizeof(cname),"%s%s%s",cd,UDMSLASHSTR,"indexer.conf");
1399 cname[sizeof(cname)-1]='\0';
1400 }
1401 }
1402
1403
1404 if (UDM_OK != (rc= UdmIndexerEnvLoad(&Main, cname,
1405 io.add_servers + io.load_langmaps +
1406 io.load_spells +
1407 io.add_server_urls + io.load_for_dump)))
1408 {
1409 fprintf(logfile, "%s\n", UdmEnvErrMsg(&Conf));
1410 goto ex;
1411 }
1412
1413
1414 #ifdef HAVE_LOCALE_H
1415 /*
1416 Needed for Mimer to make non-ascii Latin1 characters work.
1417 The locale should typically be set to en_US.iso88591
1418 */
1419 if (UdmVarListFind(&Conf.Vars, "Locale"))
1420 setlocale(LC_ALL, UdmVarListFindStr(&Conf.Vars, "Locale", "C"));
1421 #endif
1422
1423 if (io.url_filename && strcmp(io.url_filename,"-"))
1424 {
1425 /* Make sure URL file is readable if not STDIN */
1426 FILE *url_file;
1427 if (!(url_file= fopen(io.url_filename,"r")))
1428 {
1429 UdmLog(&Main, UDM_LOG_ERROR,
1430 "Error: can't open url file '%s': %s",
1431 io.url_filename, strerror(errno));
1432 goto ex;
1433 }
1434 fclose(url_file);
1435 }
1436
1437
1438
1439 if (io.cmd == UDM_IND_DUMPCONF)
1440 {
1441 if (UDM_OK != (rc= UdmEnvSave(&Main, "-", 0)))
1442 fprintf(logfile, "%s\n", UdmEnvErrMsg(&Conf));
1443 goto ex;
1444 }
1445
1446
1447 if (io.cmd == UDM_IND_CHECKCONF)
1448 {
1449 rc= 0;
1450 goto ex;
1451 }
1452
1453
1454 UdmEnvSetLogLevel(Main.Conf, io.have_loglevel ? loglevel :
1455 UdmVarListFindInt(&Main.Conf->Vars, "LogLevel", UDM_LOG_INFO));
1456 #ifndef WIN32
1457 UdmOpenLog("indexer", &Conf, io.log2stderr);
1458 #endif
1459
1460 if (io.insert && io.url_filename)
1461 {
1462 if (UDM_OK != UdmIndInsertFromFile(&Main, &io))
1463 {
1464 UdmLog(&Main,UDM_LOG_ERROR,"Error: '%s'",UdmEnvErrMsg(Main.Conf));
1465 goto ex;
1466 }
1467 }
1468
1469 switch (io.cmd)
1470 {
1471 case UDM_IND_HASHSPELL: rc= UdmIndHashSpell(&Main); goto ex;
1472 case UDM_IND_DUMPSPELL: rc= UdmIndDumpSpell(&Main); goto ex;
1473 case UDM_IND_DUMPDATA: rc= UdmDumpDocuments(&Main, &Query); goto ex;
1474 case UDM_IND_RESTOREDATA: rc= UdmRestoreDocuments(&Main, &io); goto ex;
1475 case UDM_IND_SQLMON: rc= UdmIndSQLMonitor(&Main, &io); goto ex;
1476 case UDM_IND_EXECSQL: rc= UdmIndSQLMonitor(&Main, &io); goto ex;
1477 case UDM_IND_MULTI2BLOB: rc= UdmQueryAction(&Main, &Query, UDM_QUERYCMD_INDEX); goto ex;
1478 case UDM_IND_EXPORT: rc= UdmQueryAction(&Main, &Query, UDM_QUERYCMD_EXPORT); goto ex;
1479 case UDM_IND_WRDSTAT: rc= UdmQueryAction(&Main, &Query, UDM_QUERYCMD_WORDSTAT); goto ex;
1480 case UDM_IND_REWRITEURL: rc= UdmQueryAction(&Main, &Query, UDM_QUERYCMD_REWRITE_URLDATA); goto ex;
1481 case UDM_IND_REWRITELIMITS: rc= UdmQueryAction(&Main, &Query, UDM_QUERYCMD_REWRITE_LIMITS); goto ex;
1482 case UDM_IND_REWRITEPOP: rc= UdmQueryAction(&Main, &Query, UDM_QUERYCMD_REWRITE_POPULARITY); goto ex;
1483 case UDM_IND_STAT: rc= ShowStatistics(&Main, &Query); goto ex;
1484 case UDM_IND_REFERERS: rc= ShowReferers(&Main, &Query);goto ex;
1485 case UDM_IND_CREATE: rc= UdmIndCreate(&Main); goto ex;
1486 case UDM_IND_DROP: rc= UdmIndDrop(&Main); goto ex;
1487 case UDM_IND_DELETE: rc= UdmIndDelete(&Main, &Query, &io); goto ex;
1488 case UDM_IND_CRAWL: rc= UdmIndCrawl(&Main, &Query, &io); goto ex;
1489 case UDM_IND_INDEX: rc= UdmIndCrawl(&Main, &Query, &io); goto ex;
1490
1491 case UDM_IND_AMBIGUOUS:
1492 case UDM_IND_UNKNOWN:
1493 case UDM_IND_CHECKCONF:
1494 case UDM_IND_CONVERT:
1495 case UDM_IND_DUMPCONF:
1496 case UDM_IND_SET:
1497 case UDM_IND_SET0:
1498 break;
1499 }
1500
1501
1502 ex:
1503 total_threads= 0;
1504 UdmAgentFree(&Main);
1505 UdmQueryFree(&Query);
1506 UdmEnvFree(&Conf);
1507 UdmDestroyMutexes();
1508 UdmWSACleanup();
1509 #ifndef HAVE_ATEXIT
1510 exitproc();
1511 #endif
1512 return rc == UDM_OK ? 0 : 1;
1513 }
1514