1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; either version 2 of the License, or
6    (at your option) any later version.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 
18 #ifndef _UDM_SEARCH_TOOL_H
19 #define _UDM_SEARCH_TOOL_H
20 
/*
  Numeric tuning constants for the search code.
  NOTE(review): their meaning is inferred from the names only -
  confirm at the use sites.
*/

/* Presumably the number of documents handled by the fast pre-sort. */
#define UDM_FAST_PRESORT_DOCS        300

/* Presumably the weight assigned to a user word when none is given. */
#define UDM_DEFAULT_USER_WORD_WEIGHT 256
23 
/*
  Unsigned integer type used as a bit mask (at least 64 bits wide,
  as guaranteed for unsigned long long).
  NOTE(review): presumably one bit per found query word - confirm
  against the code that sets UDM_SNIPPETCHUNK_STAT.mask.
*/
typedef unsigned long long int udm_found_word_bitmask_t;
25 
26 
27 typedef struct
28 {
29   udm_found_word_bitmask_t mask;
30   uint4 quality;
31 } UDM_SNIPPETCHUNK_STAT;
32 
33 void UdmSnippetChunkStatInit(UDM_SNIPPETCHUNK_STAT *);
34 
35 typedef struct
36 {
37   UDM_CONV src_uni;
38   UDM_CONV uni_dst;
39   UDM_CONV uni_wcs;
40   int hlstop;
41   int segmenter;
42 } UDM_HIGHLIGHT_CONV;
43 
44 
45 typedef struct udm_search_query_param_st
46 {
47   double NumWordFactor;
48   unsigned int NumSections;
49   unsigned int MinCoordFactor;
50   unsigned int MaxCoordFactor;
51   unsigned int Phrase2CountFactor;
52   unsigned int Phrase3CountFactor;
53   unsigned int WordFormFactor;
54   unsigned int WordDensityFactor;
55   unsigned int SkipWordDistanceThreshold;
56   unsigned int WordDistanceWeight;
57   unsigned int IDFFactor;
58   urlid_t DebugURLId;
59   unsigned int SingleWordDistance;
60   unsigned int NumDistinctWordFactor;
61   int UserScoreFactor;
62   int PopularityFactor;
63   int RelevancyFactor;
64   int DateFactor;
65   udm_bool_t SaveSectionSize;
66   udm_bool_t NewVersion;
67   char wf[256];
68   char nwf[256];
69   unsigned int nwf_num;
70 } UDM_QUERY_PARAM;
71 
72 void UdmQueryParamInit(UDM_QUERY_PARAM *prm, UDM_ENV *Env, UDM_VARLIST *dbvars);
73 
74 
/* Functions from urldata.c */
76 void UdmURLDataFree(UDM_URLDATA *D);
77 void UdmURLDataListInit(UDM_URLDATALIST *L);
78 void UdmURLDataSortBySite(UDM_URLDATALIST *L);
79 void UdmURLDataSortByPattern(UDM_URLDATALIST *D, const char *pattern);
80 void UdmURLDataListSort(UDM_URLDATALIST *L);
81 UDM_URLDATA *UdmURLDataListSearch(UDM_URLDATALIST *List, urlid_t id);
82 void UdmURLDataListFreeItems(UDM_URLDATALIST *List, size_t first, size_t last);
83 void UdmURLDataListFree(UDM_URLDATALIST *List);
84 void UdmURLDataListClearParams(UDM_URLDATALIST *List);
85 size_t UdmURLDataCompact(UDM_URLDATA *dst, UDM_URLDATA *src, size_t n);
86 
87 /* Functions from groupby.c */
88 void UdmURLDataGroupBySite(UDM_URLDATALIST *L);
89 udm_rc_t UdmURLDataListGroupBySiteUsingSort(UDM_AGENT *A,
90                                             UDM_URLDATALIST *R,
91                                             UDM_DB *db);
92 void UdmURLDataApplySiteRank(UDM_AGENT *A, UDM_URLDATALIST *DataList,
93                              int is_aggregation_point);
94 void UdmURLDataListApplyPopularity(UDM_AGENT *A,
95                                    UDM_URLDATALIST *URLData,
96                                    const UDM_QUERY_PARAM *prm);
97 
98 udm_rc_t UdmURLDataListPackSite(UDM_URLDATALIST *List, UDM_DSTR *dstr);
99 udm_rc_t UdmURLDataListUnpackSite(UDM_AGENT *A, UDM_URLDATALIST *List,
100                                   const UDM_CONST_STR *str_arg);
101 
102 /* Functions from distance.c */
103 typedef struct udm_distance_stat_st
104 {
105   uint4 sum;
106   uint4 num;
107   uint4 phrase2_count;
108   uint4 phrase3_count;
109 } UDM_WORD_DISTANCE_STAT;
110 
111 void UdmWordDistanceStatInit(UDM_WORD_DISTANCE_STAT *d);
112 
113 void CalcAverageWordDistance(size_t wf2_secno,
114                              UDM_COORD2 *phr, size_t num, size_t nuniq,
115                              UDM_WORD_DISTANCE_STAT *dist);
116 
117 /* Functions from score.c */
118 udm_rc_t UdmUserScoreListApplyToURLScoreList(UDM_AGENT *A,
119                                              UDM_URLSCORELIST *URLScoreList,
120                                              UDM_URL_INT4_LIST *UserScoreList,
121                                              const UDM_QUERY_PARAM *prm);
122 udm_rc_t UdmUserScoreListApplyToURLDataList(UDM_AGENT *A,
123                                             UDM_URLDATALIST *URLDataList,
124                                             UDM_URL_INT4_LIST *UserScoreList,
125                                             const UDM_QUERY_PARAM *prm);
126 
127 udm_rc_t
128 UdmURLDataListApplyRelevancyFactors(UDM_AGENT *Agent,
129                                     UDM_URLDATALIST *DataList,
130                                     const UDM_QUERY_PARAM *prm);
131 
132 void UdmURLScoreListSortByScore(UDM_URLSCORELIST *ScoreList);
133 void UdmURLScoreListSortByScoreThenURLTop(UDM_URLSCORELIST *ScoreList,
134                                                  size_t topcount);
135 void UdmURLScoreListSortByScoreThenURL(UDM_URLSCORELIST *ScoreList);
136 
137 void UdmGroupByURL2(UDM_AGENT *Agent, UDM_DB *db, UDM_QUERY *Query,
138                     const UDM_QUERY_PARAM *query_param,
139                     UDM_SEARCHSECTIONLIST *CoordList,
140                     UDM_URLSCORELIST *ScoreList);
141 
142 udm_rc_t UdmQueryPrepare(UDM_AGENT *query, UDM_QUERY *Query);
143 udm_rc_t UdmApplyFastLimit(UDM_URLCRDLIST *Coord, UDM_URLID_LIST *urls);
144 
145 /* Functions from wordinfo.c */
146 udm_rc_t UdmQueryWordInfo(UDM_QUERY *Query);
147 
148 
149 /* Functions from fuzzy.c */
150 udm_rc_t UdmAllForms(UDM_AGENT *Indexer, UDM_WIDEWORDLIST *result,
151                      UDM_WIDEWORD *uword);
152 
153 udm_rc_t UdmComplexSynonyms(UDM_AGENT *Indexer, UDM_WIDEWORDLIST *result);
154 
155 /* Functions from highlight.c */
156 void UdmExcerptConvInit(UDM_HIGHLIGHT_CONV *cnv,
157                         UDM_CHARSET *wcs,
158                         UDM_CHARSET *src,
159                         UDM_CHARSET *dst,
160                         int hlstop, int segmenter);
161 void UdmExcerptConvInitFromEnv(UDM_HIGHLIGHT_CONV *cnv,
162                                UDM_CHARSET *wcs,
163                                UDM_CHARSET *src,
164                                UDM_CHARSET *dst,
165                                UDM_ENV *Env);
166 size_t
167 UdmHlConvertExtWithConv(UDM_AGENT *A,
168                         UDM_DSTR *dstr,
169                         UDM_SNIPPETCHUNK_STAT *stat,
170                         UDM_WIDEWORDLIST *List,
171                         const char *src, size_t srclen,
172                         UDM_HIGHLIGHT_CONV *ec);
173 udm_rc_t
174 UdmVarListHlConvert(UDM_AGENT *A,
175                     UDM_WIDEWORDLIST *WWList,
176                     UDM_VARLIST *Vars,
177                     UDM_HIGHLIGHT_CONV *ec);
178 
179 udm_rc_t UdmQueryConvert(UDM_AGENT *A, UDM_QUERY *Query,
180                          UDM_CHARSET *lcs, UDM_CHARSET *bcs);
181 size_t UdmRemoveHiLight(char *dst, size_t dstlength,
182                         const char *src, size_t srclength);
183 size_t UdmRemoveHl(UDM_CHARSET *cs, char *str, size_t from, size_t to);
184 char* UdmRemoveHiLightDup(const char *s);
185 
186 /* Functions from segment.c */
187 int *UdmUniSegment(UDM_AGENT *Indexer, int *s, const char *lang, const char *seg);
188 int *UdmUniSegmentByType(UDM_AGENT *Indexer, int *s, int type, int ch);
189 int UdmUniSegmenterFind(UDM_ENV *Env, const char *lang, const char *seg);
190 udm_rc_t UdmTextListSegment(UDM_AGENT *Indexer, UDM_TEXTLIST *tlist,
191                             UDM_CHARSET *cs, int type);
192 
193 /* Functions from section.c */
194 UDM_SEARCHSECTION *UdmSearchSectionListFind(UDM_SEARCHSECTIONLIST *SectionList,
195                                             urlid_t url_id);
196 void UdmSearchSectionListPrint(UDM_SEARCHSECTIONLIST *SectionList);
197 udm_rc_t UdmSearchSectionListAlloc(UDM_SEARCHSECTIONLIST *List, size_t ncoords, size_t nsections);
198 void UdmSearchSectionListFree(UDM_SEARCHSECTIONLIST *List);
199 udm_rc_t UdmSearchSectionListListAdd(UDM_SEARCHSECTIONLISTLIST *List,
200                                      UDM_SEARCHSECTIONLIST *Item);
201 void UdmSearchSectionListListInit(UDM_SEARCHSECTIONLISTLIST *List);
202 void UdmSearchSectionListListFree(UDM_SEARCHSECTIONLISTLIST *List);
203 udm_rc_t UdmSearchSectionListListMergeSorted(UDM_SEARCHSECTIONLISTLIST *SrcList,
204                                              UDM_SEARCHSECTIONLIST *Dst, int opt);
205 
206 /* Functions from urlidlist.c */
207 int UdmCmpURLID(urlid_t *s1, urlid_t *s2); /* for qsort */
208 udm_rc_t UdmURLIdListJoin(UDM_URLID_LIST *urls, UDM_URLID_LIST *fl_urls);
209 udm_rc_t UdmURLIdListUnion(UDM_URLID_LIST *a, UDM_URLID_LIST *b);
210 udm_rc_t UdmURLIdListCopy(UDM_URLID_LIST *a, UDM_URLID_LIST *b);
211 udm_rc_t UdmURLIdListMerge(UDM_URLID_LIST *a, UDM_URLID_LIST *b);
212 void UdmURLIdListSort(UDM_URLID_LIST *a);
213 
214 /* Functions from popularity.c */
215 udm_rc_t UdmURLDataListUnpackPopularity(UDM_AGENT *A, UDM_URLDATALIST *URLDataList,
216                                         UDM_CONST_STR *cstr);
217 udm_rc_t UdmURLDataListPackPopularity(UDM_AGENT *A, UDM_URLDATALIST *List,
218                                       UDM_DSTR *pop);
219 
220 #endif
221