1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; either version 2 of the License, or
6    (at your option) any later version.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 
18 #include "udm_config.h"
19 
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <errno.h>
23 #include <string.h>
24 #include <sys/types.h>
25 #include <fcntl.h>
26 #include <signal.h>
27 #ifdef HAVE_LOCALE_H
28 #include <locale.h>
29 #endif
30 
31 #ifdef WIN32
32 #include <process.h>
33 #endif
34 
35 #ifdef HAVE_READLINE
36 #include <readline/readline.h>
37 #include <readline/history.h>
38 #endif
39 
40 #ifdef CHASEN
41 #include <chasen.h>
42 #endif
43 
44 #ifdef MECAB
45 #include <mecab.h>
46 #endif
47 
48 #include "udmsearch.h"
49 #include "udm_sqldbms.h"  /* TODO34: Remove this */
50 #include "udm_db_int.h"   /* TODO34: Remove this */
51 #include "udm_query.h"    /* TODO34: Remove this */
52 #include "udm_http.h"
53 
54 /* This should be last include */
55 #ifdef DMALLOC
56 #include "dmalloc.h"
57 #endif
58 
/* Verbosity threshold used by the SQL monitor callbacks; replaced by the -v option */
static int loglevel= UDM_LOG_INFO;
/* Configuration file name given with the -d option (empty string if not given) */
static char cname[1024]= "";

/* Environment that the command line option handler stores its settings in */
static UDM_ENV Conf;

/* The variables below are defined in another translation unit */
extern unsigned int seconds;   /* To sleep between documents   */
extern int flags;              /* For indexer                  */
extern int total_threads;      /* Total threads number         */
extern int sleep_threads;      /* Number of sleeping threads   */
extern int max_index_time;     /* Set from the -c option       */
extern int maxthreads;         /* Set from the -N option       */
extern UDM_CRAWLER *ThreadCrawlers; /* Indexed by thread number in httpd_client_handler() */
extern int thd_errors;
72 
#ifdef HAVE_PTHREAD
/* NOTE(review): not referenced in the visible part of this file */
static udm_thread_t *threads= NULL;

#ifdef WIN32
#include <time.h>
/*
  Replacement for localtime_r() on Windows builds.
  Copies the broken-down local time for *clock into *result.
  Returns result, or NULL when localtime() cannot convert the value
  (in that case *result is left untouched).
*/
static struct tm*
localtime_r(const time_t *clock, struct tm *result)
{
  const struct tm *converted= localtime(clock);
  /* localtime() returns NULL on failure; do not dereference it */
  if (!converted)
    return NULL;
  *result= *converted;
  return result;
}
/* No strptime() here: every parse attempt evaluates to "no match" */
#define strptime(x, y, z) (0)
#endif /* WIN32 */
#endif /* HAVE_PTHREAD */
87 
88 
89 /* CallBack function for log information */
90 #ifdef WIN32
UdmShowInfo(UDM_AGENT * A,const char * state,const char * str)91 void UdmShowInfo(UDM_AGENT* A, const char *state, const char* str)
92 {
93   printf("%d %s %s\n", A ? A->handle : 0,state,str);
94 }
95 #else
96 extern UDM_API(void) UdmShowInfo(UDM_AGENT* A, const char *state, const char* str);
97 #endif /* WIN32 */
98 
99 
100 static const char *execsql= NULL;
101 static char *
sqlexecgets(UDM_IOHANDLER * iohandler,char * str,size_t len)102 sqlexecgets(UDM_IOHANDLER *iohandler, char *str, size_t len)
103 {
104   UDM_SQLMON_PARAM *prm= (UDM_SQLMON_PARAM*) iohandler->user_data;
105   if (!execsql)
106     return NULL;
107   udm_snprintf(str, len, "%s", execsql);
108   prm->flags|= UDM_SQLMON_DONT_NEED_SEMICOLON;
109   execsql= NULL;
110   return str;
111 }
112 
113 
/*
  Read the next input line for the SQL monitor into str (at most size bytes).

  With readline support and a terminal on descriptor 0, the line is read
  with readline() using the "SQL>" prompt and non-empty lines are added
  to the history. Otherwise the prompt is printed through the I/O handler
  (only when loglevel >= UDM_LOG_INFO) and the line is read from stdin.

  Returns str, or NULL on end of input.
*/
static char* sqlmongets(UDM_IOHANDLER *iohandler, char *str, size_t size)
{
  UDM_SQLMON_PARAM *prm= (UDM_SQLMON_PARAM*) iohandler->user_data;
#ifdef HAVE_READLINE
  if (isatty(0))
  {
     char prompt[]="SQL>";
     char *line= readline(prompt);
     if (!line)
       return NULL;

     if (*line) add_history(line);
     /* We need "\n" at the end to make sqlmon work properly */
     udm_snprintf(str, size, "%s\n", line);
     /* readline() hands over a malloc()ed buffer: release it after copying */
     free(line);
  }
  else
#endif
  {
    if (loglevel >= UDM_LOG_INFO)
      prm->iohandler.prompt(&prm->iohandler, UDM_MSG_PROMPT, "SQL>");
    if (!fgets(str, (int) size, stdin))
      return NULL;
  }
  return str;
}
139 
140 
141 static udm_rc_t
sqlmonprompt(UDM_IOHANDLER * iohandler,udm_msg_t msgtype,const char * msg)142 sqlmonprompt(UDM_IOHANDLER *iohandler, udm_msg_t msgtype, const char *msg)
143 {
144   UDM_SQLMON_PARAM *prm= (UDM_SQLMON_PARAM*) iohandler->user_data;
145   switch (msgtype)
146   {
147     case UDM_MSG_ERROR:
148       if (prm->mode == udm_sqlmon_mode_batch)
149         fprintf(stdout, "ERROR at line %d: %s\n", (int) prm->lineno + 1, msg);
150       else
151         fprintf(stdout, "ERROR: %s\n", msg);
152       break;
153     case UDM_MSG_INFO:
154       if (loglevel >= UDM_LOG_INFO)
155         fprintf(stdout, "%s\n", msg);
156       break;
157     case UDM_MSG_COMMAND:
158       if (loglevel >= UDM_LOG_INFO)
159         fprintf(stdout, "'%s'\n", msg);
160       break;
161     case UDM_MSG_EOL:
162       fprintf(stdout, "\n");
163       break;
164     case UDM_MSG_DATA:
165     case UDM_MSG_PROMPT:
166       fprintf(stdout, "%s", msg);
167       break;
168   }
169   return UDM_OK;
170 }
171 
172 
173 static udm_rc_t
UdmIndCreate(UDM_AGENT * Agent)174 UdmIndCreate(UDM_AGENT *Agent)
175 {
176   udm_rc_t rc;
177   if (UDM_OK != (rc= UdmDBAction(Agent, UDM_DBCMD_CREATE)))
178     UdmLog(Agent, UDM_LOG_ERROR, "Error: '%s'", UdmEnvErrMsg(Agent->Conf));
179   return rc;
180 }
181 
182 
183 static udm_rc_t
UdmIndDrop(UDM_AGENT * Agent)184 UdmIndDrop(UDM_AGENT *Agent)
185 {
186   udm_rc_t rc;
187   if (UDM_OK != (rc= UdmDBAction(Agent, UDM_DBCMD_DROP)))
188     UdmLog(Agent, UDM_LOG_ERROR, "Error: '%s'", UdmEnvErrMsg(Agent->Conf));
189   return rc;
190 }
191 
192 
193 static udm_rc_t
ShowStatistics(UDM_AGENT * Indexer,UDM_QUERY * Query)194 ShowStatistics(UDM_AGENT *Indexer, UDM_QUERY *Query)
195 {
196   udm_rc_t rc;
197   struct tm tm;
198   const char *stat_time;
199   char sbuf[32];
200   size_t snum;
201   UDM_STAT  Total;
202 
203   bzero((void*)&Total, sizeof(Total));
204   Query->StatList.time= time(NULL);
205   stat_time = UdmVarListFindStr(&Conf.Vars, "stat_time", "0");
206   bzero(&tm, sizeof(tm));
207 
208 #ifndef WIN32
209   if (stat_time &&
210       strlen(stat_time) >= 7 &&
211       stat_time[4] == '-' &&
212       (stat_time[7] == '-' || !stat_time[7]) &&
213       (strptime(stat_time, "%Y-%m-%d %H:%M:%S", &tm) ||
214        strptime(stat_time, "%Y-%m-%d %H:%M", &tm) ||
215        strptime(stat_time, "%Y-%m-%d %H:%M", &tm) ||
216        strptime(stat_time, "%Y-%m-%d %H", &tm) ||
217        strptime(stat_time, "%Y-%m-%d", &tm) ||
218        strptime(stat_time, "%Y-%m", &tm)))
219   {
220     Query->StatList.time = mktime(&tm);
221   }
222   else if (stat_time && (Query->StatList.time= Udm_dp2time_t(stat_time)) >= 0)
223   {
224     Query->StatList.time += time(NULL);
225     localtime_r(&Query->StatList.time, &tm);
226   }
227   else
228   {
229     Query->StatList.time = time(NULL);
230     localtime_r(&Query->StatList.time, &tm);
231   }
232 #else
233   {
234     struct tm *tm1;
235     Query->StatList.time= time(NULL);
236     tm= *(tm1= localtime(&Query->StatList.time));
237   }
238 #endif
239   if (UDM_OK != (rc= UdmQueryAction(Indexer, Query, UDM_QUERYCMD_STATISTICS)))
240   {
241     UdmLog(Indexer, UDM_LOG_ERROR, "Error: '%s'", UdmEnvErrMsg(Indexer->Conf));
242     goto ex;
243   }
244 
245   strftime(sbuf, sizeof(sbuf), "%Y-%m-%d %H:%M:%S", &tm);
246   printf("\n          Database statistics [%s]\n\n", sbuf);
247   printf("%10s %10s %10s\n","Status","Expired","Total");
248   printf("   -----------------------------\n");
249   for (snum= 0; snum < Query->StatList.nstats; snum++)
250   {
251     UDM_STAT *S= &Query->StatList.Stat[snum];
252     printf("%10d %10d %10d %s\n",S->status,S->expired,S->total,UdmHTTPErrMsg(S->status));
253     Total.expired+=S->expired;
254     Total.total+=S->total;
255   }
256   printf("   -----------------------------\n");
257   printf("%10s %10d %10d\n","Total",Total.expired,Total.total);
258   printf("\n");
259 
260 ex:
261   return rc;
262 }
263 
/*
  Callback for referers:
  print "<code> <url> <ref>" followed by a newline to stdout.
*/
static void UdmRefProc(int code, const char *url, const char * ref)
{
  fprintf(stdout, "%d %s %s\n", code, url, ref);
}
269 
270 static udm_rc_t
ShowReferers(UDM_AGENT * Indexer,UDM_QUERY * Query)271 ShowReferers(UDM_AGENT *Indexer, UDM_QUERY *Query)
272 {
273   return UdmQueryAction(Indexer, Query, UDM_QUERYCMD_REFERERS);
274 }
275 
276 
/* THINFO_TEST is undefined right here, so the callback below is compiled out */
#undef THINFO_TEST
#ifdef THINFO_TEST
/*
  Callback for thread information:
  print "<handle> <state> <str>" followed by a newline to stdout.
*/
void UdmShowThreadInfoProc(int handle,char *state, char* str)
{
  fprintf(stdout, "%d %s %s\n", handle, state, str);
}
#endif
285 
286 
cmpgrp(const void * v1,const void * v2)287 static int cmpgrp(const void *v1, const void *v2)
288 {
289   int res;
290   UDM_CHARSET *c1= (UDM_CHARSET*) v1;
291   UDM_CHARSET *c2= (UDM_CHARSET*) v2;
292   if ((res = strcasecmp(UdmCsGroup(c1), UdmCsGroup(c2))))
293     return res;
294   return strcasecmp(c1->name,c2->name);
295 }
296 
display_charsets(FILE * file)297 static void display_charsets(FILE *file)
298 {
299   UDM_CHARSET *cs=NULL;
300   struct udm_cset_st c[100];
301   size_t i=0;
302   size_t n=0;
303   int family=-1;
304 
305   for(cs=UdmGetCharSetByID(0) ; cs && cs->name ; cs++)
306   {
307     /* Skip not compiled charsets */
308     if (cs->family != UDM_CHARSET_UNKNOWN)
309       c[n++]=*cs;
310   }
311   fprintf(file,"\n%d charsets available:\n", (int) n);
312 
313   UdmSort(c,n,sizeof(UDM_CHARSET),&cmpgrp);
314   for(i=0;i<n;i++)
315   {
316     if (family!=c[i].family)
317     {
318       fprintf(file, "\n%19s : ", UdmCsGroup(&c[i]));
319       family=c[i].family;
320     }
321     fprintf(file,"%s ",c[i].name);
322   }
323   fprintf(file,"\n");
324 }
325 
326 
/*
  Print the help page to stdout: version banner, usage line,
  the descriptions of the given options and the bug report address.
  With level > 1 the list of available character sets follows.
  Always returns 0.
*/
static int usage(int level, UDM_CMDLINE_OPT *options)
{
  FILE *out= stdout;

  fputs("\n", out);
  fprintf(out, "indexer from %s-%s-%s\n", PACKAGE, VERSION, UDM_DBTYPE);
  fputs("http://www.mnogosearch.org/ (C)1998-2015, LavTech Corp.\n", out);
  fputs("\n", out);
  fputs("Usage: indexer [OPTIONS]  [configfile]\n", out);

  UdmCmdLineOptionsPrint(options, out);

  fputs("\n", out);
  fputs("\n", out);
  fputs("Please post bug reports and suggestions at http://www.mnogosearch.org/bugs/\n", out);

  if (level > 1)
    display_charsets(out);

  return 0;
}
346 
347 
348 /*
349   Load indexer.conf and check if any DBAddr were given
350 */
351 static udm_rc_t
UdmIndexerEnvLoad(UDM_AGENT * Indexer,const char * fname,int lflags)352 UdmIndexerEnvLoad(UDM_AGENT *Indexer, const char *fname,int lflags)
353 {
354   udm_rc_t rc;
355   if (UDM_OK == (rc= UdmEnvLoad(Indexer, fname, lflags)))
356   {
357     if (Indexer->Conf->DBList.nitems == 0)
358     {
359       sprintf(Indexer->Conf->errstr, "Error: '%s': No required DBAddr commands were specified", fname);
360       rc= UDM_ERROR;
361     }
362   }
363   return rc;
364 }
365 
366 
367 
368 /*
369   Parse command line
370 */
/*
  Command line argument count and vector.
  In the visible part of this file they are only referenced from the
  commented-out UdmReloadEnv().
  NOTE(review): presumably filled in by main() - confirm.
*/
static int UdmARGC;
static char **UdmARGV;
373 
374 
375 
/*
  Settings collected from the command line by UdmCmdLineHandleOption().
  Defaults are assigned in UdmIndexerOptionsInit().
*/
typedef struct
{
  const char *url_filename;  /* File name given with -f, NULL if not given      */
  udm_indcmd_t cmd;          /* Command to run, UDM_IND_CRAWL by default        */
  udm_bool_t insert;         /* Set by -i                                       */
  udm_bool_t expire;         /* Set by -a                                       */
  udm_bool_t block;          /* Set by -b                                       */
  udm_bool_t help;           /* Incremented for -h, -? and unhandled option ids */
  udm_bool_t log2stderr;     /* True by default, cleared by -l                  */
  udm_bool_t have_loglevel;  /* Set when -v was given                           */
  udm_bool_t warnings;       /* True by default, cleared by -w; when set,
                                UdmIndDelete() asks for confirmation            */

  /* Flag values; -C, -S, -I, -Q and --exec clear all but add_server_urls */
  int add_servers;           /* UDM_FLAG_ADD_SERV by default; -qq adds
                                UDM_FLAG_DONT_ADD_TO_DB                         */
  int add_server_urls;       /* UDM_FLAG_ADD_SERVURL by default, 0 after -q     */
  int load_langmaps;         /* UDM_FLAG_LOAD_LANGMAP by default                */
  int load_spells;           /* UDM_FLAG_SPELL by default                       */
  int load_for_dump;         /* 0 by default; not changed by the option handler */

} UDM_INDEXER_OPTIONS;
395 
396 
397 static void
UdmIndexerOptionsInit(UDM_INDEXER_OPTIONS * io)398 UdmIndexerOptionsInit(UDM_INDEXER_OPTIONS *io)
399 {
400   io->cmd=    UDM_IND_CRAWL;
401   io->insert= UDM_FALSE;
402   io->expire= UDM_FALSE;
403   io->block=  UDM_FALSE;
404   io->help=   UDM_FALSE;
405   io->url_filename= NULL;
406   io->log2stderr= UDM_TRUE;
407   io->have_loglevel= UDM_FALSE;
408 
409   io->add_servers=      UDM_FLAG_ADD_SERV;
410   io->add_server_urls=  UDM_FLAG_ADD_SERVURL;
411   io->load_langmaps=    UDM_FLAG_LOAD_LANGMAP;
412   io->load_spells=      UDM_FLAG_SPELL;
413   io->load_for_dump=    0;
414   io->warnings= UDM_TRUE;
415 }
416 
417 
418 static void
UdmFeaturesPrint(const char * mask)419 UdmFeaturesPrint(const char *mask)
420 {
421   UDM_VARLIST W;
422   size_t i;
423   UdmVarListInit(&W);
424   UdmFeatures(&W);
425   for (i= 0; i < W.nvars; i++)
426   {
427     const UDM_VAR *var= UdmVarListFindConstByIndex(&W, i);
428     if (!UdmWildCaseCmp(UdmVarName(var), mask))
429     {
430       UDM_CONST_STR strbuf, *str= UdmVarGetConstStr(var, &strbuf);
431       printf("%s:%.*s\n", UdmVarName(var), (int) str->length, str->str);
432     }
433   }
434   UdmVarListFree(&W);
435 }
436 
437 /*
438   Available new options:
439   Capital letters:  B    GH JK M OP   T VXWXYZ
440   Small   letters:           k             x z
441   Digits:          123456789
442 */
443 static int
UdmCmdLineHandleOption(void * user_data,const UDM_CMDLINE_OPT * opt,const char * value)444 UdmCmdLineHandleOption(void *user_data,
445                        const UDM_CMDLINE_OPT *opt, const char *value)
446 {
447   UDM_INDEXER_OPTIONS *io= (UDM_INDEXER_OPTIONS*) user_data;
448   switch (opt->id)
449   {
450     case 'F':
451     {
452       UdmFeaturesPrint(value);
453       exit(0);
454     }
455     case 'C':
456       io->cmd= UDM_IND_DELETE;
457       io->add_servers= io->load_langmaps= io->load_spells=0;
458       break;
459     case 'S':
460       io->cmd= UDM_IND_STAT;
461       io->add_servers= io->load_langmaps= io->load_spells= 0;
462       break;
463     case 'I':
464       io->cmd= UDM_IND_REFERERS;
465       io->add_servers= io->load_langmaps= io->load_spells= 0;
466       break;
467     case 'Q':
468       io->cmd= UDM_IND_SQLMON;
469       io->add_servers= io->load_langmaps= io->load_spells= 0;
470       break;
471     case UDM_IND_INDEX:
472     case UDM_IND_CRAWL:
473     case UDM_IND_CREATE:
474     case UDM_IND_DROP:
475     case UDM_IND_CHECKCONF:
476     case UDM_IND_CONVERT:
477     case UDM_IND_MULTI2BLOB:
478     case UDM_IND_EXPORT:
479     case UDM_IND_WRDSTAT:
480     case UDM_IND_REWRITEURL:
481     case UDM_IND_REWRITEPOP:
482     case UDM_IND_HASHSPELL:
483     case UDM_IND_DUMPSPELL:
484     case UDM_IND_REWRITELIMITS:
485     case UDM_IND_DUMPCONF:
486     case UDM_IND_DUMPDATA:
487     case UDM_IND_RESTOREDATA:
488       io->cmd= (udm_indcmd_t) opt->id;
489       break;
490     case UDM_IND_EXECSQL:
491       io->cmd= UDM_IND_SQLMON;
492       UdmVarListReplaceStr(&Conf.Vars, "exec", value);
493       io->add_servers= io->load_langmaps= io->load_spells= 0;
494       break;
495     case UDM_IND_SET0:
496       UDM_ASSERT(opt->name && opt->name[0]);
497       UdmVarListReplaceStr(&Conf.Vars, opt->name, value);
498       break;
499     case UDM_IND_SET:
500       {
501         const char *eq= strchr(value, '=');
502         char name[80];
503         if (eq && ((size_t)(eq - value)) < sizeof(name))
504         {
505           memcpy(name, value, eq - value);
506           name[eq-value]= '\0';
507           UdmVarListReplaceStr(&Conf.Vars, name, eq + 1);
508         }
509         else
510           UdmVarListReplaceInt(&Conf.Vars, value, 1);
511       }
512       break;
513 
514     case 'q':
515       if (io->add_server_urls == 0) /* -q already given */
516       {
517         /*
518           "indexer -qq" is given, do even faster start-up.
519           Don't synchonize "Server" commands in indexer.conf
520           with the "server" table content.
521         */
522         io->add_servers|= UDM_FLAG_DONT_ADD_TO_DB;
523       }
524       io->add_server_urls= 0;
525       break;
526     case 'l': io->log2stderr= UDM_FALSE; break;
527     case 'a': io->expire= UDM_TRUE; break;
528     case 'b': io->block= UDM_TRUE; break;
529     case 'e': flags|=UDM_FLAG_SORT_EXPIRED;break;
530     case 'o': flags|=UDM_FLAG_SORT_HOPS;break;
531     case 'r': flags|=UDM_FLAG_DONTSORT_SEED; break;
532     case 'm': flags|=UDM_FLAG_REINDEX;break;
533     case 'n': Conf.url_number=atoi(value);break;
534     case 'c': max_index_time=atoi(value);break;
535     case 'v': loglevel= atoi(value); io->have_loglevel= UDM_TRUE; break;
536     case 'p': seconds=atoi(value);break;
537     case 't': UdmVarListAddStr(&Conf.Vars,"tag" , value);break;
538     case 's': UdmVarListAddStr(&Conf.Vars, "status", value);break;
539     case 'y': UdmVarListAddStr(&Conf.Vars,"type", value);break;
540     case 'L': UdmVarListAddStr(&Conf.Vars,"lang", value);break;
541     case 'u': UdmVarListAddStr(&Conf.Vars,"u"   , value);
542       if (io->insert)
543       {
544         UDM_HREFPARAM HrefParam;
545         UdmHrefParamInit(&HrefParam);
546         HrefParam.link_source= UDM_LINK_SOURCE_CMDLINE;
547         UdmHrefListAddConst(&Conf.Hrefs, &HrefParam, value);
548       }
549       break;
550     case 'N':
551       maxthreads=atoi(value);
552       UdmVarListReplaceInt(&Conf.Vars, "CrawlerThreads", maxthreads);
553       UdmVarListReplaceInt(&Conf.Vars, "IndexerThreads", maxthreads);
554       break;
555     case 'f': io->url_filename= value; break;
556     case 'i': io->insert= UDM_TRUE; break;
557     case 'w': io->warnings= UDM_FALSE; break;
558     case 'j': UdmVarListAddStr(&Conf.Vars, "stat_time", value); break;
559     case 'd': strncpy(cname, value, sizeof(cname));
560       cname[sizeof(cname) - 1] = '\0';
561       break;
562     case 'D': UdmVarListAddStr(&Conf.Vars,"DBLimit"   , value); break;
563     case '?':
564     case 'h':
565     default:
566       io->help++;
567   }
568   return 0;
569 }
570 
571 
/*
  Available new options:
  Capital letters: AB    GH JK M OP   TUVXWXYZ
  Small   letters:           k             x z
  Digits:          123456789
*/
/*
  Command line options of indexer, as passed to UdmCmdLineOptionsGet()
  and UdmCmdLineOptionsPrint().

  The first member of every entry is the option id that
  UdmCmdLineHandleOption() switches on, the second one is the option name.
  NOTE(review): the remaining members look like the value type, an unused
  pointer and the help text (NULL for entries without one) - confirm
  against the UDM_CMDLINE_OPT declaration.
  Entries with id -1 and UDM_OPT_TITLE carry section headings.
  The list ends with an all-zero entry.
*/
static UDM_CMDLINE_OPT udm_indexer_options[]=
{
  {-1, "",  UDM_OPT_TITLE,NULL, "\nCrawler options:"},
  {'a', "", UDM_OPT_BOOL, NULL, "Revisit all documents even if not expired (can be\n"
                                "limited using -t, -u, -s, -c, -y and -f options)"},
  {'m', "", UDM_OPT_BOOL, NULL, "Update expired documents even if not modified (can be\n"
                                "limited using -t, -u, -c, -s, -y and -f options)"},
  {'e', "", UDM_OPT_BOOL, NULL, "Visit 'most expired' (oldest) documents first"},
  {'o', "", UDM_OPT_BOOL, NULL, "Visit documents with less depth (hops value) first"},
  {'r', "", UDM_OPT_BOOL, NULL, "Do not try to reduce remote servers load by randomising\n"
                                "crawler queue order (faster, but less polite)"},
  {'n', "", UDM_OPT_INT,  NULL, "Visit only # documents and exit"},
  {'c', "", UDM_OPT_INT,  NULL, "Visit only # seconds and exit"},
  {'q', "", UDM_OPT_BOOL, NULL, "Quick startup (do not add Server URLs);  -qq even quicker"},
  {'b', "", UDM_OPT_BOOL, NULL, "Block starting more than one indexer instances"},
  {'i', "", UDM_OPT_BOOL, NULL, "Insert new URLs (URLs to insert must be given using -u or -f)"},
  {'p', "", UDM_OPT_INT,  NULL, "Sleep # seconds after downloading every URL"},
  {'w', "", UDM_OPT_BOOL, NULL, "Do not ask for confirmation when clearing documents\n"
                                "from the database (e.g.: indexer -Cw)"},
  {'N', "", UDM_OPT_INT,  NULL, "Run # threads (for crawling or indexing)"},


  {-1, "",  UDM_OPT_TITLE,NULL, "\nSubsection control options (can be combined):"},
  {'s', "", UDM_OPT_STR,  NULL, "Limit indexer to documents matching status (HTTP Status code)"},
  {'t', "", UDM_OPT_STR,  NULL, "Limit indexer to documents matching tag"},
  {'y', "", UDM_OPT_STR,  NULL, "Limit indexer to documents matching content-type"},
  {'L', "", UDM_OPT_STR,  NULL, "Limit indexer to documents matching language"},
  {'u', "", UDM_OPT_STR,  NULL, "Limit indexer to documents with URLs matching pattern\n"
                                "(supports SQL LIKE wildcards '%' and '_')"},
  {0, "seed",UDM_OPT_STR, NULL, "Limit indexer to documents with the given seed (0-255)"},
  {'D', "", UDM_OPT_STR,  NULL, "Work with the n-th database only (i.e. with the n-th DBAddr)"},
  {'f', "", UDM_OPT_STR,  NULL, "Read URLs to be visited/inserted/deleted from file (with -a\n"
                                "or -C option, supports SQL LIKE wildcard '%%'; has no effect\n"
                                "when combined with -m option)"},
  {-1,  "", UDM_OPT_TITLE,NULL,
  "  -f -            Use stdin instead of a file as an URL list"},


  {-1,  "", UDM_OPT_TITLE,NULL, "\nLogging options:"},
  {'l', "", UDM_OPT_BOOL, NULL, "Do not log to stdout/stderr"},
  {'v', "", UDM_OPT_INT,  NULL, "Verbose level (0-5)"},


  {-1, "",  UDM_OPT_TITLE,NULL, "\nMisc. options:"},
  {'F', "", UDM_OPT_STR,  NULL, "Print compile configuration and exit (e.g.: indexer -F '*')"},
  {'h',"help",UDM_OPT_BOOL, NULL,"Print help page and exit; -hh print more help"},
  {'?', "", UDM_OPT_BOOL, NULL, "Print help page and exit; -?? print more help"},
  {'d', "", UDM_OPT_STR,  NULL, "Use the given configuration file instead of indexer.conf"
#ifndef WIN32
                                "\nThis option is usefull when running indexer as an\n"
                                "interpreter, e.g.: #!/usr/local/sbin/indexer -d"
#endif
  },
  {'j', "", UDM_OPT_STR,  NULL, "Set current time for statistic (use with -S),\n"
                                "format: YYYY-MM[-DD[ HH[:MM[:SS]]]]\n"
                                "or time offset, e.g. 1d12h (see Period in indexer.conf)"},
  {UDM_IND_SET,          "set",          UDM_OPT_STR, NULL, "Set variable"},
  {-1, "",  UDM_OPT_TITLE,NULL, "\nCommands (can be used with subsection control options):"},
  {UDM_IND_CRAWL,        "crawl",        UDM_OPT_BOOL,NULL, "Crawl (default command)"},
  {UDM_IND_MULTI2BLOB,   "index",        UDM_OPT_BOOL,NULL, "Create search index"},
  {UDM_IND_WRDSTAT,      "wordstat",     UDM_OPT_BOOL,NULL, "Create statistics for misspelled word suggestions"},
  {UDM_IND_REWRITEURL,   "rewriteurl",   UDM_OPT_BOOL,NULL, "Rewrite URL data into the current search index"},
  {UDM_IND_REWRITELIMITS,"rewritelimits",UDM_OPT_BOOL,NULL, "Recreate all Limit, UserScore, UserOrder data"},
  {UDM_IND_REWRITEPOP,   "rewritepop",   UDM_OPT_BOOL,NULL, "Recreate popularity data"},
  {UDM_IND_DELETE, /*C*/  "delete",      UDM_OPT_BOOL,NULL, "Delete documents from the database"},
  {UDM_IND_STAT,   /*S*/ "statistics",   UDM_OPT_BOOL,NULL, "Print statistics and exit"},
  {UDM_IND_REFERERS,/*I*/ "referers",    UDM_OPT_BOOL,NULL, "Print referers and exit "},

  {-1, "",  UDM_OPT_TITLE,NULL, "\nOther commands:"},
#ifdef HAVE_SQL /* TODO34 */
  {UDM_IND_CREATE,       "create",       UDM_OPT_BOOL,NULL, "Create SQL table structure and exit"},
  {UDM_IND_DROP,         "drop",         UDM_OPT_BOOL,NULL, "Drop SQL table structure and exit"},
  {UDM_IND_SQLMON, /*Q*/ "sqlmon",       UDM_OPT_BOOL,NULL, "Run interactive SQL monitor"},
  {UDM_IND_EXECSQL,      "exec",         UDM_OPT_STR, NULL, "Execute SQL query"},
#endif
  {UDM_IND_CHECKCONF,    "checkconf",    UDM_OPT_BOOL,NULL, "Check configuration file for good syntax"},
  {UDM_IND_EXPORT,       "export",       UDM_OPT_BOOL,NULL, NULL}, /* TODO */
  {UDM_IND_HASHSPELL,    "hashspell",    UDM_OPT_BOOL,NULL, "Create hash files for the active Ispell dictionaries"},
  {UDM_IND_DUMPSPELL,    "dumpspell",    UDM_OPT_BOOL,NULL, "Dump Ispell data for use with SQLWordForms"},
  {UDM_IND_DUMPCONF,     "dumpconf",     UDM_OPT_BOOL,NULL, NULL},
  {UDM_IND_DUMPDATA,     "dumpdata",     UDM_OPT_BOOL,NULL, "Dump collected data using SQL statements"},
  {UDM_IND_RESTOREDATA,  "restoredata",  UDM_OPT_BOOL,NULL, "Load prevously dumped data (give a filename using -f)"},

  {UDM_IND_SET0, "fl", UDM_OPT_STR, NULL, NULL},
  {0,NULL,0,NULL,NULL}
};
664 
665 
666 static int
UdmParseCmdLine(UDM_INDEXER_OPTIONS * io,int argc,char ** argv,size_t * noptions)667 UdmParseCmdLine(UDM_INDEXER_OPTIONS *io,
668                 int argc, char **argv, size_t *noptions)
669 {
670   return UdmCmdLineOptionsGet(io, argc, argv, udm_indexer_options,
671                               UdmCmdLineHandleOption, noptions);
672 }
673 
674 
675 /*
676 static int
677 UdmReloadEnv(UDM_AGENT *Indexer)
678 {
679   UDM_ENV   NewConf;
680   int  rc;
681 
682   UdmLog(Indexer,UDM_LOG_ERROR,"Reloading config '%s'",cname);
683   UdmEnvInit(&NewConf);
684   UdmSetLockProc(&NewConf,UdmLockProc);
685   UdmSetRefProc(&NewConf,UdmRefProc);
686 
687   Indexer->Conf = &NewConf;
688   rc = UdmIndexerEnvLoad(Indexer, cname, add_servers + load_langmaps + UDM_FLAG_SPELL);
689   Indexer->Conf = &Conf;
690 
691   if (rc!=UDM_OK)
692   {
693     UdmLog(Indexer,UDM_LOG_ERROR,"Can't load config: %s",UdmEnvErrMsg(&NewConf));
694     UdmLog(Indexer,UDM_LOG_ERROR,"Continuing with old config");
695     UdmEnvFree(&NewConf);
696   }
697   else
698   {
699     size_t noptions;
700     UdmEnvFree(&Conf);
701     Conf=NewConf;
702     UdmParseCmdLine(UdmARGC, UdmARGV, &noptions);
703 #ifndef WIN32
704     UdmOpenLog("indexer", &Conf, log2stderr);
705 #endif
706   }
707   return UDM_OK;
708 }
709 */
710 
711 
712 static size_t
create_shared_info(UDM_AGENT * A,char * str,size_t len)713 create_shared_info(UDM_AGENT *A, char *str, size_t len)
714 {
715   UDM_ENV *Env= A->Conf;
716   size_t res= udm_snprintf(str, len,
717                           "Hrefs: %d,"
718                           "Targets: %d,"
719                           "Cookies: %d,"
720                           "Robots: %d,"
721                           "Hosts: %d,"
722                           "IPs: %d",
723                           (int) Env->Hrefs.nhrefs,
724                           (int) Env->Targets.num_rows,
725                           (int) Env->Cookies.nvars,
726                           (int) Env->Robots.nrobots,
727                           (int) Env->Hosts.nhost_addr,
728                           (int) Env->InAddr.nitems);
729   return res;
730 }
731 
732 
733 static udm_rc_t
httpd_client_handler(int client,UDM_AGENT * A)734 httpd_client_handler(int client, UDM_AGENT *A)
735 {
736   char request[4096];
737   char response[1024];
738   char speed_info[128]= "";
739   char shared_info[128]= "";
740   ssize_t nrecv;
741   size_t i, len, total_docs= 0, total_sec= 0;
742   udm_uint8 total_bytes= 0;
743   time_t now= time(0);
744 
745   nrecv= recv(client, request, sizeof(request), 0);
746   UdmLog(A, UDM_LOG_ERROR, "Received request len=%d", (int) nrecv);
747   udm_snprintf(response, sizeof(response) - 1,
748                "HTTP/1.0 200 OK\r\nContent-Type: text/html\r\n\r\n");
749   UdmSend(client, response, strlen(response), 0);
750   len= sprintf(response,
751                "Threads:"
752                "<table border=1 cellspacing=1 cellpadding=1>\n"
753                "<tr><th>ID</th>"
754                "<th>Docs</th>"
755                "<th>Size</th>"
756                "<th>Task</th>"
757                "<th>Time</th>"
758                "<th>Param</th>"
759                "<th>Extra</th></tr>");
760   UdmSend(client, response, len, 0);
761   for (i= 0; i < (size_t) maxthreads; i++)
762   {
763     UDM_AGENT *Tmp= &ThreadCrawlers[i].Agent;
764     char mutex_owned_info[64]= "", *mi;
765     size_t sec;
766     udm_mutexno_t mutex;
767 
768     for (mutex= 0, mi= mutex_owned_info;
769          mutex < UDM_LOCK_MAX;
770          mutex++)
771     {
772       mi+= UdmMutexStatePrint(mi, sizeof(mutex_owned_info) - (mi - mutex_owned_info),
773                               Tmp, mutex);
774     }
775 
776     len= sprintf(response,
777                  "<tr><td>%d</td>"
778                  "<td align=right>%d</td>"
779                  "<td align=right>%llu</td>"
780                  "<td>%s</td>"
781                  "<td align=right>%d</td>"
782                  "<td>%s&nbsp;</td>"
783                  "<td>%s%s&nbsp;</td></tr>\n",
784                  Tmp->handle,
785                  (int) Tmp->ndocs,
786                  (unsigned long long) Tmp->nbytes,
787                  Tmp->State.task,
788                  (int) (now - Tmp->State.start_time),
789                  UDM_NULL2EMPTY(Tmp->State.param),
790                  UDM_NULL2EMPTY(Tmp->State.extra),
791                  mutex_owned_info);
792     UdmSend(client, response, len, 0);
793     total_docs+= Tmp->ndocs;
794     total_bytes+= Tmp->nbytes;
795     sec= (size_t) (now - Tmp->start_time);
796     if (sec > total_sec)
797       total_sec= sec;
798   }
799   if (total_sec)
800   {
801     udm_snprintf(speed_info, sizeof(speed_info) - 1,
802                  "%d seconds, %d docs/sec, %d bytes/sec",
803                  (int) total_sec,
804                  (int) (total_docs / total_sec),
805                  (int) (total_bytes / total_sec));
806   }
807 
808   len= sprintf(response,
809                  "<tr><td>&nbsp;</td>"
810                  "<td align=right>%d</td>"
811                  "<td align=right>%llu</td>"
812                  "<td>&nbsp;</td>"
813                  "<td align=right>&nbsp;</td>"
814                  "<td>%s&nbsp;</td>"
815                  "<td>&nbsp;</td></tr>\n",
816                  (int) total_docs,
817                  (unsigned long long) total_bytes,
818                  speed_info);
819   UdmSend(client, response, len, 0);
820   len= sprintf(response, "</table>\n");
821   UdmSend(client, response, len, 0);
822   len= create_shared_info(A, shared_info, sizeof(shared_info));
823   UdmSend(client, shared_info, len, 0);
824   return UDM_OK;
825 }
826 
827 
828 #ifdef  WIN32
thread_main_httpd(void * arg)829 unsigned int __stdcall thread_main_httpd(void *arg)
830 #else
831 static void* thread_main_httpd(void *arg)
832 #endif
833 {
834   UDM_AGENT *A= (UDM_AGENT*) arg;
835 #ifndef WIN32
836   UdmStartHTTPD(A, httpd_client_handler);
837 #endif
838   return 0;
839 }
840 
841 
842 
/* Name of the file that exitproc() removes */
static char pidname[1024];
/* Buffer that time_pid_info() formats into and returns */
static char time_pid[100];

/* Remove the file named by pidname */
static void exitproc(void)
{
  unlink(pidname);
}


/*
  Format the current local time and the process id as
  "<weekday> <day> <HH:MM:SS> [<pid>]".
  The time part is left out if the current time cannot be converted.
  Returns a pointer to a static buffer that the next call overwrites.
*/
static char * time_pid_info(void)
{
  time_t now= time(NULL);
  const struct tm *tim= localtime(&now);
  size_t used= 0;

  /* localtime() may fail; strftime() must not be given a NULL pointer */
  if (tim)
    used= strftime(time_pid, sizeof(time_pid), "%a %d %H:%M:%S", tim);
  /* When strftime() returns 0 the buffer content is unspecified: reset it */
  time_pid[used]= '\0';
  snprintf(time_pid + used, sizeof(time_pid) - used, " [%d]", (int) getpid());
  return time_pid;
}
862 
863 
/*
  Initialize Winsock (version 1.1) on Windows builds; a no-op elsewhere.
  On failure prints the error code returned by WSAStartup() to stderr
  and terminates the program with exit status 1.
*/
static void UdmWSAStartup(void)
{
#ifdef WIN32
  WSADATA wsaData;
  /* WSAStartup() returns the error code itself */
  int err= WSAStartup(0x101, &wsaData);
  if (err != 0)
  {
    fprintf(stderr, "WSAStartup() error %d\n", err);
    exit(1);
  }
#endif
}
875 
/*
  Counterpart of UdmWSAStartup():
  calls WSACleanup() on Windows builds, does nothing elsewhere.
*/
static void UdmWSACleanup(void)
{
#ifdef WIN32
  WSACleanup();
#endif
}
883 
/*
  Print msg and read the answer from stdin.
  Returns 1 if the answer starts with "YES" (upper case), 0 otherwise,
  including the case when nothing can be read.
*/
static int UdmConfirm(const char *msg)
{
  char str[5];

  printf("%s",msg);
  /* The prompt has no newline: push it out before waiting for the answer */
  fflush(stdout);
  if (!fgets(str, sizeof(str), stdin))
    return 0;
  return !strncmp(str, "YES", 3);
}
890 
891 
892 static udm_rc_t
UdmIndDelete(UDM_AGENT * A,UDM_QUERY * Query,const UDM_INDEXER_OPTIONS * io)893 UdmIndDelete(UDM_AGENT *A, UDM_QUERY *Query, const UDM_INDEXER_OPTIONS *io)
894 {
895   int clear_confirmed=1;
896   udm_rc_t rc= UDM_OK;
897   if (io->warnings)
898   {
899     size_t i;
900     printf("You are going to delete content from the database(s):\n");
901     for (i = 0; i < Conf.DBList.nitems; i++)
902     {
903       UDM_DB *db= &Conf.DBList.Item[i];
904       char dbaddr[128];
905       size_t nbytes;
906       db->dbhandler->Info(db, dbaddr, sizeof(dbaddr), &nbytes, UDM_DBINFO_ADDR);
907       printf("%s\n", dbaddr);
908     }
909     clear_confirmed=UdmConfirm("Are you sure?(YES/no)");
910   }
911 
912   if (clear_confirmed)
913   {
914     if (io->url_filename)
915     {
916       rc= UdmURLFile(A, io->url_filename, UDM_URL_FILE_CLEAR);
917     }
918     else
919     {
920       rc= UdmQueryAction(A, Query, UDM_QUERYCMD_CLEAR);
921     }
922   }
923   else
924   {
925     printf("Canceled\n");
926   }
927   if (rc != UDM_OK)
928   {
929     fflush(stdout);
930     UdmLog(A, UDM_LOG_ERROR, "Error: '%s'", UdmEnvErrMsg(A->Conf));
931   }
932   return rc;
933 }
934 
935 static void
UdmPrintCumulativeStatistics(UDM_AGENT * Indexer,UDM_CRAWLER * Item,size_t nitems,time_t sec)936 UdmPrintCumulativeStatistics(UDM_AGENT *Indexer,
937                              UDM_CRAWLER *Item, size_t nitems, time_t sec)
938 {
939   size_t i;
940   unsigned int ndocs= 0;
941   unsigned long long nbytes= 0;
942   double M= 0.0, K= 0.0;
943 
944   for (i= 0; i < nitems; i++)
945   {
946     ndocs+= Item[i].Agent.ndocs;
947     nbytes+= Item[i].Agent.nbytes;
948   }
949   if (sec > 0)
950   {
951     /* Convert to int64 - conversion from uint64 to double doesn't work on windows */
952     M= ((udm_timer_t) nbytes) / 1048576.0 / sec;
953     if (M < 1.0) K= ((udm_timer_t) nbytes) / 1024.0 / sec;
954   }
955   UdmLog(Indexer, UDM_LOG_ERROR,
956          "Done (%d seconds, %u documents, %llu bytes, %5.2f %cbytes/sec.)",
957          (int) sec, (unsigned int) ndocs, (unsigned long long) nbytes,
958          (M < 1.0) ? K : M, (M < 1.0) ? 'K' : 'M' );
959 }
960 
961 
962 static udm_rc_t
UdmCrawl(UDM_AGENT * A,UDM_QUERY * Query)963 UdmCrawl(UDM_AGENT *A, UDM_QUERY *Query)
964 {
965   UDM_AGENT httpd_agent;
966   size_t nbytes;
967   const UDM_VAR *Listen;
968   time_t start_time= time(0);
969 
970   maxthreads= UdmVarListFindInt(&A->Conf->Vars, "CrawlerThreads", 1);
971   nbytes= maxthreads * sizeof(UDM_CRAWLER);
972   ThreadCrawlers= (UDM_CRAWLER*) UdmMalloc(nbytes);
973   bzero((void*) ThreadCrawlers, nbytes);
974   UdmAgentInit(&httpd_agent, A->Conf, 0);
975 
976   if ((Listen= UdmVarListFind(&A->Conf->Vars, "Listen")))
977   {
978 #ifdef HAVE_PTHREAD
979     if (UdmVarStr(Listen) && UdmVarStr(Listen)[0])
980     {
981       udm_thread_t httpd_thread;
982       UdmThreadCreate(&httpd_thread, thread_main_httpd, &httpd_agent);
983     }
984     else
985 #endif
986     {
987       UdmLog(A, UDM_LOG_ERROR, "Not starting HTTPD");
988     }
989   }
990 
991 
992 #ifdef HAVE_PTHREAD
993   {
994     int i;
995     for (i= 0; i < maxthreads; i++)
996     {
997       UdmAgentInit(&ThreadCrawlers[i].Agent, A->Conf, i + 1);
998       ThreadCrawlers[i].Query= Query;
999       ThreadCrawlers[i].Agent.flags= flags;
1000     }
1001   }
1002 
1003   {
1004     int i;
1005     nbytes= maxthreads * sizeof(udm_thread_t);
1006     threads= (udm_thread_t*) UdmMalloc(nbytes);
1007     bzero((void*) threads, nbytes);
1008     for(i= 0; i < maxthreads; i++)
1009     {
1010       if (seconds) UDMSLEEP(seconds);
1011 
1012       UdmThreadCreate(&threads[i], &UdmCrawlerMain, &ThreadCrawlers[i]);
1013 
1014       UDM_GETLOCK(A, UDM_LOCK_THREAD);
1015       total_threads= i + 1;
1016       UDM_RELEASELOCK(A,UDM_LOCK_THREAD);
1017     }
1018 #ifndef WIN32
1019     for (i = 0; i < maxthreads; i++)
1020       pthread_join(threads[i], NULL);
1021 #else
1022     while(1)
1023     {
1024       int num;
1025       UDM_GETLOCK(A,UDM_LOCK_THREAD);
1026       num=total_threads;
1027       UDM_RELEASELOCK(A,UDM_LOCK_THREAD);
1028       if (!num)break;
1029       UDMSLEEP(1);
1030     }
1031 #endif
1032     UDM_FREE(threads);
1033   }
1034 #else
1035   A->handle = 1;
1036   UdmCrawlerMain(A);
1037 #endif
1038 
1039   if (maxthreads > 1)
1040     UdmPrintCumulativeStatistics(A, ThreadCrawlers, maxthreads,
1041                                  time(0) - start_time);
1042   UdmAgentFree(&httpd_agent);
1043   UDM_FREE(ThreadCrawlers);
1044   return thd_errors ? UDM_ERROR : UDM_OK;
1045 }
1046 
1047 
1048 static int
UdmDumpDocument(UDM_AGENT * A,UDM_DOCUMENT * D)1049 UdmDumpDocument(UDM_AGENT *A, UDM_DOCUMENT *D)
1050 {
1051   size_t i;
1052   printf("  <item>\n");
1053   /*
1054   printf("    <link>%s</link>\n", UdmVarListFindStr(&D->Sections, "URL", ""));
1055   printf("    <id>%s</id>\n", UdmVarListFindStr(&D->Sections, "ID", ""));
1056   printf("    <content-length>%s</content-length>\n", UdmVarListFindStr(&D->Sections, "Content-Length", ""));
1057   printf("    <status>%s</status>\n", UdmVarListFindStr(&D->Sections, "Status", ""));
1058   printf("    <hops>%s</hops>\n", UdmVarListFindStr(&D->Sections, "Hops", ""));
1059   printf("    <crc32>%s</crc32>\n", UdmVarListFindStr(&D->Sections, "crc32", ""));
1060   printf("    <modified>%s</modified>\n", UdmVarListFindStr(&D->Sections, "Last-Modified", ""));
1061   */
1062   for (i= 0; i < D->Sections.nvars; i++)
1063   {
1064     const UDM_VAR *S= UdmVarListFindConstByIndex(&D->Sections, i);
1065     UDM_CONST_STR strbuf, *str= UdmVarGetConstStr(S, &strbuf);
1066     printf("    <%s>%.*s</%s>\n",
1067            UdmVarName(S), (int) str->length, str->str, UdmVarName(S));
1068   }
1069   printf("  </item>\n");
1070   return 0;
1071 }
1072 
1073 
1074 static udm_rc_t
UdmDumpDocuments(UDM_AGENT * A,UDM_QUERY * Query)1075 UdmDumpDocuments(UDM_AGENT *A, UDM_QUERY *Query)
1076 {
1077   udm_rc_t rc;
1078   A->Conf->DumpDoc= UdmDumpDocument;
1079 
1080   /* printf("<rss><channel>\n"); */
1081   if (UDM_OK != (rc= UdmQueryAction(A, Query, UDM_QUERYCMD_DUMPDATA)))
1082     return rc;
1083   /* printf("</channel></rss>\n"); */
1084   return UDM_OK;
1085 }
1086 
1087 
1088 #include <sys/stat.h>
1089 /*
1090   TODO for dump/restore:
1091   - Escape bad characters to entities on dump, unescape on restore.
1092   - Huge file size
1093   - Subsection control
1094   - Database number limit
1095   - Insert into database
1096 */
1097 static udm_rc_t
UdmRestoreDocuments(UDM_AGENT * A,const UDM_INDEXER_OPTIONS * io)1098 UdmRestoreDocuments(UDM_AGENT *A, const UDM_INDEXER_OPTIONS *io)
1099 {
1100   udm_rc_t rc= UDM_OK;
1101   UDM_DSTR dstr;
1102   struct stat sb;
1103   int fd;
1104   UDM_CHARSET *cs;
1105 
1106   if (!io->url_filename)
1107   {
1108     fprintf(stderr, "Required option -f ommitted\n");
1109     return UDM_ERROR;
1110   }
1111   fprintf(stderr, "Restoring data from '%s'\n", io->url_filename);
1112 
1113   if (stat(io->url_filename, &sb))
1114   {
1115     fprintf(stderr, "Can't stat '%s'\n", io->url_filename);
1116     return UDM_ERROR;
1117   }
1118   UdmDSTRInit(&dstr, 1024);
1119   if ((fd= open(io->url_filename, O_RDONLY|UDM_BINARY)) <= 0)
1120   {
1121     rc= UDM_ERROR;
1122     fprintf(stderr, "Can't open '%s'", io->url_filename);
1123     goto ex;
1124   }
1125 
1126   if (UDM_OK != (rc= UdmDSTRReadFile(&dstr, fd, sb.st_size)))
1127   {
1128     rc= UDM_ERROR;
1129 #ifndef WIN32
1130     fprintf(stderr, "Failed to read %lld bytes from '%s'\n",
1131             (long long) sb.st_size, io->url_filename);
1132 #endif
1133     goto ex;
1134   }
1135 
1136   cs= &udm_charset_latin1;
1137   rc= UdmQueryFromXML(A, NULL, UdmDSTRPtr(&dstr), UdmDSTRLength(&dstr), cs);
1138 ex:
1139   UdmDSTRFree(&dstr);
1140   return rc;
1141 }
1142 
1143 
1144 static udm_rc_t
UdmIndHashSpell(UDM_AGENT * Agent)1145 UdmIndHashSpell(UDM_AGENT *Agent)
1146 {
1147   UDM_ENV *Env= Agent->Conf;
1148   char errmsg[256];
1149   udm_rc_t rc;
1150   if (UDM_OK != (rc= UdmSpellListListLoad(&Env->Spells, errmsg, sizeof(errmsg)))||
1151       UDM_OK != (rc= UdmSpellListListWriteHash(&Env->Spells, errmsg, sizeof(errmsg))))
1152   {
1153     fprintf(stderr, "error: %s\n", errmsg);
1154   }
1155   return rc;
1156 }
1157 
1158 
1159 static udm_rc_t
UdmIndDumpSpell(UDM_AGENT * Agent)1160 UdmIndDumpSpell(UDM_AGENT *Agent)
1161 {
1162   char errmsg[256];
1163   int spflags= UDM_SPELL_NOPREFIX;
1164   UDM_ENV *Env= Agent->Conf;
1165   udm_rc_t rc;
1166   if (UDM_OK != (rc= UdmSpellListListLoad(&Env->Spells,
1167                                           errmsg, sizeof(errmsg)))||
1168       UDM_OK != (rc= UdmAffixListListLoad(&Env->Affixes, spflags,
1169                                           errmsg, sizeof(errmsg)))||
1170       UDM_OK != (rc= UdmSpellDump(&Env->Spells, &Env->Affixes,
1171                                           errmsg, sizeof(errmsg))))
1172   {
1173     fprintf(stderr, "error: %s\n", errmsg);
1174   }
1175   return rc;
1176 }
1177 
1178 
1179 static udm_rc_t
UdmIndSQLMonitor(UDM_AGENT * Agent,UDM_INDEXER_OPTIONS * io)1180 UdmIndSQLMonitor(UDM_AGENT *Agent, UDM_INDEXER_OPTIONS *io)
1181 {
1182   udm_rc_t rc;
1183   UDM_SQLMON_PARAM prm;
1184   bzero((void*)&prm,sizeof(prm));
1185   prm.flags= loglevel > 0 ? UDM_SQLMON_DISPLAY_FIELDS : 0;
1186   execsql= UdmVarListFindStr(&Conf.Vars, "exec", NULL);
1187   prm.iohandler.gets= execsql ? sqlexecgets : sqlmongets;
1188   prm.iohandler.prompt= sqlmonprompt;
1189   prm.iohandler.user_data= &prm;
1190 #ifdef HAVE_READLINE
1191   if (isatty(0))
1192     prm.mode= udm_sqlmon_mode_interactive;
1193 #endif
1194   rc= UdmSQLMonitor(Agent, &Conf, &prm);
1195   if (prm.iohandler.gets == sqlmongets)
1196   {
1197     /* Display "\n" after the last ">SQL" prompt */
1198     sqlmonprompt(&prm.iohandler, UDM_MSG_EOL, "");
1199   }
1200   return rc;
1201 }
1202 
1203 
1204 static udm_rc_t
UdmIndBlock(void)1205 UdmIndBlock(void)
1206 {
1207   int pid_fd;
1208   char pidbuf[128];
1209   /* Check that another instance isn't running and create PID file. */
1210   const char *vardir= UdmVarListFindStr(&Conf.Vars,"VarDir",UDM_VAR_DIR);
1211   sprintf(pidname,"%s/%s", vardir ,"indexer.pid");
1212   pid_fd = open(pidname,O_CREAT|O_EXCL|O_WRONLY,0644);
1213   if (pid_fd < 0)
1214   {
1215     fprintf(stderr,"%s Can't create '%s': %s\n", time_pid_info(), pidname, strerror(errno));
1216     if (errno == EEXIST)
1217     {
1218       fprintf(stderr,"It seems that another indexer is already running!\n");
1219       fprintf(stderr,"Remove '%s' if it is not true.\n",pidname);
1220     }
1221     return UDM_ERROR;
1222   }
1223   udm_snprintf(pidbuf, sizeof(pidbuf), "%d\n", (int)getpid());
1224   write(pid_fd, &pidbuf, strlen(pidbuf));
1225 #ifdef HAVE_ATEXIT
1226   atexit(&exitproc);
1227 #endif
1228   return UDM_OK;
1229 }
1230 
1231 
1232 static udm_rc_t
UdmIndInsertFromFile(UDM_AGENT * Agent,UDM_INDEXER_OPTIONS * io)1233 UdmIndInsertFromFile(UDM_AGENT *Agent, UDM_INDEXER_OPTIONS *io)
1234 {
1235   udm_rc_t rc= UDM_OK;
1236   if (strcmp(io->url_filename,"-"))
1237   {
1238     /* Make sure all URLs to be inserted are OK */
1239     if (UDM_OK != (rc= UdmURLFile(Agent, io->url_filename, UDM_URL_FILE_PARSE)))
1240     {
1241       UdmLog(Agent, UDM_LOG_ERROR,"Error: Invalid URL in '%s'", io->url_filename);
1242       goto ex;
1243     }
1244   }
1245   if (UDM_OK != (rc= UdmURLFile(Agent, io->url_filename, UDM_URL_FILE_INSERT)))
1246   {
1247     UdmLog(Agent, UDM_LOG_ERROR,"Error: '%s'", UdmEnvErrMsg(Agent->Conf));
1248     goto ex;
1249   }
1250 ex:
1251   return rc;
1252 }
1253 
1254 
1255 static udm_rc_t
UdmIndExpire(UDM_AGENT * Agent,UDM_QUERY * Query,const UDM_INDEXER_OPTIONS * io)1256 UdmIndExpire(UDM_AGENT *Agent, UDM_QUERY *Query, const UDM_INDEXER_OPTIONS *io)
1257 {
1258   udm_rc_t rc= io->url_filename ?
1259                UdmURLFile(Agent, io->url_filename, UDM_URL_FILE_REINDEX) :
1260                UdmQueryAction(Agent, Query, UDM_QUERYCMD_EXPIRE);
1261   return rc;
1262 }
1263 
1264 
1265 static udm_rc_t
UdmIndCrawl(UDM_AGENT * Agent,UDM_QUERY * Query,const UDM_INDEXER_OPTIONS * io)1266 UdmIndCrawl(UDM_AGENT *Agent, UDM_QUERY *Query, const UDM_INDEXER_OPTIONS *io)
1267 {
1268   udm_rc_t rc;
1269   if (io->block && UDM_OK != (rc= UdmIndBlock()))
1270     goto ex;
1271   UdmLog(Agent, UDM_LOG_WARN, "indexer from %s-%s-%s started with '%s'", PACKAGE, VERSION, UDM_DBTYPE, cname);
1272   UdmSigHandlersInit(Agent);
1273 
1274   if (io->expire)
1275   {
1276     if (UDM_OK != (rc= UdmIndExpire(Agent, Query, io)))
1277       goto ex;
1278   }
1279 
1280   if (UDM_OK != (rc= UdmCrawl(Agent, Query)))
1281     goto ex;
1282 
1283 ex:
1284   return rc;
1285 }
1286 
1287 
main(int argc,char ** argv)1288 int main(int argc, char **argv)
1289 {
1290   UDM_INDEXER_OPTIONS io;
1291   UDM_AGENT Main;
1292   UDM_QUERY Query;
1293   udm_rc_t rc= UDM_OK;
1294   char  *REQUEST_METHOD= getenv("REQUEST_METHOD");
1295   FILE  *logfile= REQUEST_METHOD ? stdout : stderr;
1296   size_t noptions;
1297 #ifdef CHASEN
1298   char  *chasen_argv[] = { "chasen", "-b", "-f", "-F", "%m ", NULL };
1299   chasen_getopt_argv(chasen_argv, NULL);
1300 #endif
1301 
1302   UdmIndexerOptionsInit(&io);
1303 
1304   if (REQUEST_METHOD)
1305     printf("Content-Type: text/plain\r\n\r\n");
1306 
1307   UdmWSAStartup();
1308 
1309   UdmInit(); /* Initialize library */
1310 
1311   UdmInitMutexes();
1312   UdmEnvInit(&Conf);
1313   UdmVarListAddEnviron(&Conf.Vars,"ENV");
1314   UdmSetLockProc(&Conf,UdmLockProc);
1315 
1316 #if defined(HAVE_PTHREAD) && !defined(WIN32)
1317   Conf.THDHandler.ThreadCreate= UdmThreadCreate;
1318   Conf.THDHandler.ThreadJoin= UdmThreadJoin;
1319 #endif
1320 
1321   UdmSetRefProc(&Conf,UdmRefProc);
1322 #ifdef THINFO_TEST
1323   UdmSetThreadProc(&Conf,UdmShowThreadInfoProc);
1324 #endif
1325   UdmAgentInit(&Main,&Conf,0);
1326   UdmQueryInit(&Query);
1327 
1328   UdmARGC= argc;
1329   UdmARGV= argv;
1330 
1331   if (UdmParseCmdLine(&io, UdmARGC, UdmARGV, &noptions))
1332     goto ex;
1333 
1334   if (io.cmd == UDM_IND_INDEX)
1335   {
1336     fprintf(stderr, "\nWARNING: \"indexer -Eindex\" is deprecated. "
1337                     "Use \"indexer -Ecrawl\" instead!\n\n");
1338     io.cmd= UDM_IND_CRAWL;
1339   }
1340 
1341   if (io.cmd == UDM_IND_AMBIGUOUS)
1342   {
1343     fprintf(stderr, "Ambiguous indexer command in -E\n");
1344     io.help++;
1345   }
1346 
1347   if (io.cmd == UDM_IND_UNKNOWN)
1348   {
1349     fprintf(stderr, "Unknown indexer command in -E\n");
1350     io.help++;
1351   }
1352 
1353   if (io.cmd == UDM_IND_DUMPCONF)
1354   {
1355     io.load_for_dump|= UDM_FLAG_DONT_ADD_TO_DB;
1356     io.load_langmaps= 0;
1357     io.load_spells= 0;
1358   }
1359   else if (io.cmd != UDM_IND_CRAWL)
1360   {
1361     if (io.cmd != UDM_IND_MULTI2BLOB && /* TODO34: rename this to UDM_IND_INDEX */
1362         io.cmd != UDM_IND_REWRITEPOP)
1363       io.add_servers= 0;
1364     else
1365     {
1366       io.add_servers|= UDM_FLAG_DONT_ADD_TO_DB;
1367       io.add_server_urls= 0;
1368     }
1369     io.load_langmaps= 0;
1370     if (io.cmd != UDM_IND_HASHSPELL && io.cmd  != UDM_IND_DUMPSPELL)
1371       io.load_spells= 0;
1372   }
1373 
1374   flags|= io.add_servers;
1375   flags |= io.add_server_urls;
1376   Main.flags= flags;
1377 
1378   argc-= noptions;
1379   argv+= noptions;
1380 
1381   if (argc > 1 || io.help)
1382   {
1383     usage(io.help, udm_indexer_options);
1384     rc= UDM_ERROR;
1385     goto ex;
1386   }
1387 
1388   if (!*cname)
1389   {
1390     if (argc == 1)
1391     {
1392       strncpy(cname,argv[0],sizeof(cname));
1393       cname[sizeof(cname)-1]='\0';
1394     }
1395     else
1396     {
1397       const char *cd=UdmVarListFindStr(&Conf.Vars,"ConfDir",UDM_CONF_DIR);
1398       udm_snprintf(cname,sizeof(cname),"%s%s%s",cd,UDMSLASHSTR,"indexer.conf");
1399       cname[sizeof(cname)-1]='\0';
1400     }
1401   }
1402 
1403 
1404   if (UDM_OK != (rc= UdmIndexerEnvLoad(&Main, cname,
1405                                        io.add_servers + io.load_langmaps +
1406                                        io.load_spells +
1407                                        io.add_server_urls + io.load_for_dump)))
1408   {
1409     fprintf(logfile, "%s\n", UdmEnvErrMsg(&Conf));
1410     goto ex;
1411   }
1412 
1413 
1414 #ifdef HAVE_LOCALE_H
1415   /*
1416     Needed for Mimer to make non-ascii Latin1 characters work.
1417     The locale should typically be set to en_US.iso88591
1418   */
1419   if (UdmVarListFind(&Conf.Vars, "Locale"))
1420     setlocale(LC_ALL, UdmVarListFindStr(&Conf.Vars, "Locale", "C"));
1421 #endif
1422 
1423   if (io.url_filename && strcmp(io.url_filename,"-"))
1424   {
1425     /* Make sure URL file is readable if not STDIN */
1426     FILE *url_file;
1427     if (!(url_file= fopen(io.url_filename,"r")))
1428     {
1429       UdmLog(&Main, UDM_LOG_ERROR,
1430              "Error: can't open url file '%s': %s",
1431              io.url_filename, strerror(errno));
1432       goto ex;
1433     }
1434     fclose(url_file);
1435   }
1436 
1437 
1438 
1439   if (io.cmd == UDM_IND_DUMPCONF)
1440   {
1441     if (UDM_OK != (rc= UdmEnvSave(&Main, "-", 0)))
1442       fprintf(logfile, "%s\n", UdmEnvErrMsg(&Conf));
1443     goto ex;
1444   }
1445 
1446 
1447   if (io.cmd == UDM_IND_CHECKCONF)
1448   {
1449     rc= 0;
1450     goto ex;
1451   }
1452 
1453 
1454   UdmEnvSetLogLevel(Main.Conf, io.have_loglevel ? loglevel :
1455                     UdmVarListFindInt(&Main.Conf->Vars, "LogLevel", UDM_LOG_INFO));
1456 #ifndef WIN32
1457   UdmOpenLog("indexer", &Conf, io.log2stderr);
1458 #endif
1459 
1460   if (io.insert && io.url_filename)
1461   {
1462     if (UDM_OK != UdmIndInsertFromFile(&Main, &io))
1463     {
1464       UdmLog(&Main,UDM_LOG_ERROR,"Error: '%s'",UdmEnvErrMsg(Main.Conf));
1465       goto ex;
1466     }
1467   }
1468 
1469   switch (io.cmd)
1470   {
1471     case UDM_IND_HASHSPELL:     rc= UdmIndHashSpell(&Main);     goto ex;
1472     case UDM_IND_DUMPSPELL:     rc= UdmIndDumpSpell(&Main);     goto ex;
1473     case UDM_IND_DUMPDATA:      rc= UdmDumpDocuments(&Main, &Query); goto ex;
1474     case UDM_IND_RESTOREDATA:   rc= UdmRestoreDocuments(&Main, &io); goto ex;
1475     case UDM_IND_SQLMON:        rc= UdmIndSQLMonitor(&Main, &io);    goto ex;
1476     case UDM_IND_EXECSQL:       rc= UdmIndSQLMonitor(&Main, &io);    goto ex;
1477     case UDM_IND_MULTI2BLOB:    rc= UdmQueryAction(&Main, &Query, UDM_QUERYCMD_INDEX);  goto ex;
1478     case UDM_IND_EXPORT:        rc= UdmQueryAction(&Main, &Query, UDM_QUERYCMD_EXPORT); goto ex;
1479     case UDM_IND_WRDSTAT:       rc= UdmQueryAction(&Main, &Query, UDM_QUERYCMD_WORDSTAT); goto ex;
1480     case UDM_IND_REWRITEURL:    rc= UdmQueryAction(&Main, &Query, UDM_QUERYCMD_REWRITE_URLDATA);    goto ex;
1481     case UDM_IND_REWRITELIMITS: rc= UdmQueryAction(&Main, &Query, UDM_QUERYCMD_REWRITE_LIMITS);     goto ex;
1482     case UDM_IND_REWRITEPOP:    rc= UdmQueryAction(&Main, &Query, UDM_QUERYCMD_REWRITE_POPULARITY); goto ex;
1483     case UDM_IND_STAT:          rc= ShowStatistics(&Main, &Query); goto ex;
1484     case UDM_IND_REFERERS:      rc= ShowReferers(&Main, &Query);goto ex;
1485     case UDM_IND_CREATE:        rc= UdmIndCreate(&Main);        goto ex;
1486     case UDM_IND_DROP:          rc= UdmIndDrop(&Main);          goto ex;
1487     case UDM_IND_DELETE:        rc= UdmIndDelete(&Main, &Query, &io);   goto ex;
1488     case UDM_IND_CRAWL:         rc= UdmIndCrawl(&Main, &Query, &io);    goto ex;
1489     case UDM_IND_INDEX:         rc= UdmIndCrawl(&Main, &Query, &io);    goto ex;
1490 
1491     case UDM_IND_AMBIGUOUS:
1492     case UDM_IND_UNKNOWN:
1493     case UDM_IND_CHECKCONF:
1494     case UDM_IND_CONVERT:
1495     case UDM_IND_DUMPCONF:
1496     case UDM_IND_SET:
1497     case UDM_IND_SET0:
1498       break;
1499   }
1500 
1501 
1502 ex:
1503   total_threads= 0;
1504   UdmAgentFree(&Main);
1505   UdmQueryFree(&Query);
1506   UdmEnvFree(&Conf);
1507   UdmDestroyMutexes();
1508   UdmWSACleanup();
1509 #ifndef HAVE_ATEXIT
1510   exitproc();
1511 #endif
1512   return rc == UDM_OK ? 0 : 1;
1513 }
1514