1 /* Copyright (c) 2005 MySQL AB, 2009 Sun Microsystems, Inc.
2    Use is subject to license terms.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; version 2 of the License.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335  USA */
16 
17 #ifndef _my_plugin_ftparser_h
18 #define _my_plugin_ftparser_h
19 #include "plugin.h"
20 
21 #ifdef __cplusplus
22 extern "C" {
23 #endif
24 
25 /*************************************************************************
26   API for Full-text parser plugin. (MYSQL_FTPARSER_PLUGIN)
27 */
28 
/*
  Interface version of the full-text parser plugin API.  A parser
  descriptor (struct st_mysql_ftparser) carries such a value in its
  interface_version member.
*/
#define MYSQL_FTPARSER_INTERFACE_VERSION 0x0100
30 
/* Parsing modes.  One of these values is stored in MYSQL_FTPARSER_PARAM::mode. */
enum enum_ftparser_mode
{
  /*
    Fast and simple mode, used for indexing and for natural language
    queries.

    Only the words that go into the index are expected from the parser:
    stopwords and words that are too short or too long should not be
    returned.  The 'boolean_info' argument of mysql_add_word() does not
    have to be set.
  */
  MYSQL_FTPARSER_SIMPLE_MODE = 0,

  /*
    Parse-with-stopwords mode, used in boolean searches for
    "phrase matching."

    The parser is not allowed to ignore any word in this mode: every word
    should be returned, including stopwords and words that are too short
    or too long.  The 'boolean_info' argument of mysql_add_word() does
    not have to be set.
  */
  MYSQL_FTPARSER_WITH_STOPWORDS = 1,

  /*
    Boolean mode, used to parse a boolean query string.

    The parser should pass a valid MYSQL_FTPARSER_BOOLEAN_INFO structure
    in the 'boolean_info' argument of mysql_add_word().  Usually this
    means recognizing the boolean operators in the parsing stream and
    setting the corresponding fields of MYSQL_FTPARSER_BOOLEAN_INFO.
    As in MYSQL_FTPARSER_WITH_STOPWORDS mode, no word should be ignored;
    use the FT_TOKEN_STOPWORD token type for such a word instead.
  */
  MYSQL_FTPARSER_FULL_BOOLEAN_INFO = 2
};
68 
/*
  Token types for boolean mode searching.  These are the values used for
  the 'type' member of the MYSQL_FTPARSER_BOOLEAN_INFO struct.
*/
enum enum_ft_token_type
{
  /* End of data. */
  FT_TOKEN_EOF = 0,
  /* Regular word. */
  FT_TOKEN_WORD = 1,
  /* Left parenthesis: start of a group/sub-expression. */
  FT_TOKEN_LEFT_PAREN = 2,
  /* Right parenthesis: end of a group/sub-expression. */
  FT_TOKEN_RIGHT_PAREN = 3,
  /* Stopword. */
  FT_TOKEN_STOPWORD = 4
};
88 
89 /*
90   This structure is used in boolean search mode only. It conveys
91   boolean-mode metadata to the MySQL search engine for every word in
92   the search query. A valid instance of this structure must be filled
93   in by the plugin parser and passed as an argument in the call to
94   mysql_add_word (the callback function in the MYSQL_FTPARSER_PARAM
95   structure) when a query is parsed in boolean mode.
96 
97   type: The token type.  Should be one of the enum_ft_token_type values.
98 
99   yesno: Whether the word must be present for a match to occur:
100     >0 Must be present
101     <0 Must not be present
102     0  Neither; the word is optional but its presence increases the relevance
103   With the default settings of the ft_boolean_syntax system variable,
104   >0 corresponds to the '+' operator, <0 corrresponds to the '-' operator,
105   and 0 means neither operator was used.
106 
107   weight_adjust: A weighting factor that determines how much a match
108   for the word counts.  Positive values increase, negative - decrease the
109   relative word's importance in the query.
110 
111   wasign: The sign of the word's weight in the query. If it's non-negative
112   the match for the word will increase document relevance, if it's
113   negative - decrease (the word becomes a "noise word", the less of it the
114   better).
115 
116   trunc: Corresponds to the '*' operator in the default setting of the
117   ft_boolean_syntax system variable.
118 */
119 
120 typedef struct st_mysql_ftparser_boolean_info
121 {
122   enum enum_ft_token_type type;
123   int yesno;
124   int weight_adjust;
125   char wasign;
126   char trunc;
127   /* These are parser state and must be removed. */
128   char prev;
129   char *quot;
130 } MYSQL_FTPARSER_BOOLEAN_INFO;
131 
/*
  Flag for the 'flags' member of MYSQL_FTPARSER_PARAM.

  When it is set, the buffer holding a string (document, word) may be
  overwritten by the caller before the end of the parsing, that is,
  before st_mysql_ftparser::deinit() is called.  Code that needs the
  string to survive between two successive calls of the parsing function
  must save its own copy of it.

  MySQL may set the flag before calling st_mysql_ftparser::parse(), and
  a plugin may set it before calling
  st_mysql_ftparser_param::mysql_parse() or
  st_mysql_ftparser_param::mysql_add_word().
*/
#define MYSQL_FTFLAGS_NEED_COPY 1
143 
144 /*
145   An argument of the full-text parser plugin. This structure is
146   filled in by MySQL server and passed to the parsing function of the
147   plugin as an in/out parameter.
148 
149   mysql_parse: A pointer to the built-in parser implementation of the
150   server. It's set by the server and can be used by the parser plugin
151   to invoke the MySQL default parser.  If plugin's role is to extract
152   textual data from .doc, .pdf or .xml content, it might extract
153   plaintext from the content, and then pass the text to the default
154   MySQL parser to be parsed.
155 
156   mysql_add_word: A server callback to add a new word.  When parsing
157   a document, the server sets this to point at a function that adds
158   the word to MySQL full-text index.  When parsing a search query,
159   this function will add the new word to the list of words to search
160   for.  The boolean_info argument can be NULL for all cases except
161   when mode is MYSQL_FTPARSER_FULL_BOOLEAN_INFO. A plugin can replace this
162   callback to post-process every parsed word before passing it to the original
163   mysql_add_word function.
164 
165   ftparser_state: A generic pointer. The plugin can set it to point
166   to information to be used internally for its own purposes.
167 
168   mysql_ftparam: This is set by the server.  It is used by MySQL functions
169   called via mysql_parse() and mysql_add_word() callback.  The plugin
170   should not modify it.
171 
172   cs: Information about the character set of the document or query string.
173 
174   doc: A pointer to the document or query string to be parsed.
175 
176   length: Length of the document or query string, in bytes.
177 
178   flags: See MYSQL_FTFLAGS_* constants above.
179 
180   mode: The parsing mode.  With boolean operators, with stopwords, or
181   nothing.  See  enum_ftparser_mode above.
182 */
183 
184 typedef struct st_mysql_ftparser_param
185 {
186   int (*mysql_parse)(struct st_mysql_ftparser_param *,
187                      const char *doc, int doc_len);
188   int (*mysql_add_word)(struct st_mysql_ftparser_param *,
189                         const char *word, int word_len,
190                         MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info);
191   void *ftparser_state;
192   void *mysql_ftparam;
193   const struct charset_info_st *cs;
194   const char *doc;
195   int length;
196   unsigned int flags;
197   enum enum_ftparser_mode mode;
198 } MYSQL_FTPARSER_PARAM;
199 
200 /*
201   Full-text parser descriptor.
202 
203   interface_version is, e.g., MYSQL_FTPARSER_INTERFACE_VERSION.
204   The parsing, initialization, and deinitialization functions are
205   invoked per SQL statement for which the parser is used.
206 */
207 
208 struct st_mysql_ftparser
209 {
210   int interface_version;
211   int (*parse)(MYSQL_FTPARSER_PARAM *param);
212   int (*init)(MYSQL_FTPARSER_PARAM *param);
213   int (*deinit)(MYSQL_FTPARSER_PARAM *param);
214 };
215 
216 
217 #ifdef __cplusplus
218 }
219 #endif
220 
221 #endif
222 
223