1 /* Copyright (c) 2005 MySQL AB, 2009 Sun Microsystems, Inc. 2 Use is subject to license terms. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; version 2 of the License. 7 8 This program is distributed in the hope that it will be useful, 9 but WITHOUT ANY WARRANTY; without even the implied warranty of 10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 GNU General Public License for more details. 12 13 You should have received a copy of the GNU General Public License 14 along with this program; if not, write to the Free Software 15 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ 16 17 #ifndef _my_plugin_ftparser_h 18 #define _my_plugin_ftparser_h 19 #include "plugin.h" 20 21 #ifdef __cplusplus 22 extern "C" { 23 #endif 24 25 /************************************************************************* 26 API for Full-text parser plugin. (MYSQL_FTPARSER_PLUGIN) 27 */ 28 29 #define MYSQL_FTPARSER_INTERFACE_VERSION 0x0100 30 31 /* Parsing modes. Set in MYSQL_FTPARSER_PARAM::mode */ 32 enum enum_ftparser_mode 33 { 34 /* 35 Fast and simple mode. This mode is used for indexing, and natural 36 language queries. 37 38 The parser is expected to return only those words that go into the 39 index. Stopwords or too short/long words should not be returned. The 40 'boolean_info' argument of mysql_add_word() does not have to be set. 41 */ 42 MYSQL_FTPARSER_SIMPLE_MODE= 0, 43 44 /* 45 Parse with stopwords mode. This mode is used in boolean searches for 46 "phrase matching." 47 48 The parser is not allowed to ignore words in this mode. Every word 49 should be returned, including stopwords and words that are too short 50 or long. The 'boolean_info' argument of mysql_add_word() does not 51 have to be set. 52 */ 53 MYSQL_FTPARSER_WITH_STOPWORDS= 1, 54 55 /* 56 Parse in boolean mode. This mode is used to parse a boolean query string. 57 58 The parser should provide a valid MYSQL_FTPARSER_BOOLEAN_INFO 59 structure in the 'boolean_info' argument to mysql_add_word(). 60 Usually that means that the parser should recognize boolean operators 61 in the parsing stream and set appropriate fields in 62 MYSQL_FTPARSER_BOOLEAN_INFO structure accordingly. As for 63 MYSQL_FTPARSER_WITH_STOPWORDS mode, no word should be ignored. 64 Instead, use FT_TOKEN_STOPWORD for the token type of such a word. 65 */ 66 MYSQL_FTPARSER_FULL_BOOLEAN_INFO= 2 67 }; 68 69 /* 70 Token types for boolean mode searching (used for the type member of 71 MYSQL_FTPARSER_BOOLEAN_INFO struct) 72 73 FT_TOKEN_EOF: End of data. 74 FT_TOKEN_WORD: Regular word. 75 FT_TOKEN_LEFT_PAREN: Left parenthesis (start of group/sub-expression). 76 FT_TOKEN_RIGHT_PAREN: Right parenthesis (end of group/sub-expression). 77 FT_TOKEN_STOPWORD: Stopword. 78 */ 79 80 enum enum_ft_token_type 81 { 82 FT_TOKEN_EOF= 0, 83 FT_TOKEN_WORD= 1, 84 FT_TOKEN_LEFT_PAREN= 2, 85 FT_TOKEN_RIGHT_PAREN= 3, 86 FT_TOKEN_STOPWORD= 4 87 }; 88 89 /* 90 This structure is used in boolean search mode only. It conveys 91 boolean-mode metadata to the MySQL search engine for every word in 92 the search query. A valid instance of this structure must be filled 93 in by the plugin parser and passed as an argument in the call to 94 mysql_add_word (the callback function in the MYSQL_FTPARSER_PARAM 95 structure) when a query is parsed in boolean mode. 96 97 type: The token type. Should be one of the enum_ft_token_type values. 98 99 yesno: Whether the word must be present for a match to occur: 100 >0 Must be present 101 <0 Must not be present 102 0 Neither; the word is optional but its presence increases the relevance 103 With the default settings of the ft_boolean_syntax system variable, 104 >0 corresponds to the '+' operator, <0 corrresponds to the '-' operator, 105 and 0 means neither operator was used. 106 107 weight_adjust: A weighting factor that determines how much a match 108 for the word counts. Positive values increase, negative - decrease the 109 relative word's importance in the query. 110 111 wasign: The sign of the word's weight in the query. If it's non-negative 112 the match for the word will increase document relevance, if it's 113 negative - decrease (the word becomes a "noise word", the less of it the 114 better). 115 116 trunc: Corresponds to the '*' operator in the default setting of the 117 ft_boolean_syntax system variable. 118 */ 119 120 typedef struct st_mysql_ftparser_boolean_info 121 { 122 enum enum_ft_token_type type; 123 int yesno; 124 int weight_adjust; 125 char wasign; 126 char trunc; 127 /* These are parser state and must be removed. */ 128 char prev; 129 char *quot; 130 } MYSQL_FTPARSER_BOOLEAN_INFO; 131 132 /* 133 The following flag means that buffer with a string (document, word) 134 may be overwritten by the caller before the end of the parsing (that is 135 before st_mysql_ftparser::deinit() call). If one needs the string 136 to survive between two successive calls of the parsing function, she 137 needs to save a copy of it. The flag may be set by MySQL before calling 138 st_mysql_ftparser::parse(), or it may be set by a plugin before calling 139 st_mysql_ftparser_param::mysql_parse() or 140 st_mysql_ftparser_param::mysql_add_word(). 141 */ 142 #define MYSQL_FTFLAGS_NEED_COPY 1 143 144 /* 145 An argument of the full-text parser plugin. This structure is 146 filled in by MySQL server and passed to the parsing function of the 147 plugin as an in/out parameter. 148 149 mysql_parse: A pointer to the built-in parser implementation of the 150 server. It's set by the server and can be used by the parser plugin 151 to invoke the MySQL default parser. If plugin's role is to extract 152 textual data from .doc, .pdf or .xml content, it might extract 153 plaintext from the content, and then pass the text to the default 154 MySQL parser to be parsed. 155 156 mysql_add_word: A server callback to add a new word. When parsing 157 a document, the server sets this to point at a function that adds 158 the word to MySQL full-text index. When parsing a search query, 159 this function will add the new word to the list of words to search 160 for. The boolean_info argument can be NULL for all cases except 161 when mode is MYSQL_FTPARSER_FULL_BOOLEAN_INFO. A plugin can replace this 162 callback to post-process every parsed word before passing it to the original 163 mysql_add_word function. 164 165 ftparser_state: A generic pointer. The plugin can set it to point 166 to information to be used internally for its own purposes. 167 168 mysql_ftparam: This is set by the server. It is used by MySQL functions 169 called via mysql_parse() and mysql_add_word() callback. The plugin 170 should not modify it. 171 172 cs: Information about the character set of the document or query string. 173 174 doc: A pointer to the document or query string to be parsed. 175 176 length: Length of the document or query string, in bytes. 177 178 flags: See MYSQL_FTFLAGS_* constants above. 179 180 mode: The parsing mode. With boolean operators, with stopwords, or 181 nothing. See enum_ftparser_mode above. 182 */ 183 184 typedef struct st_mysql_ftparser_param 185 { 186 int (*mysql_parse)(struct st_mysql_ftparser_param *, 187 const char *doc, int doc_len); 188 int (*mysql_add_word)(struct st_mysql_ftparser_param *, 189 const char *word, int word_len, 190 MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info); 191 void *ftparser_state; 192 void *mysql_ftparam; 193 const struct charset_info_st *cs; 194 const char *doc; 195 int length; 196 unsigned int flags; 197 enum enum_ftparser_mode mode; 198 } MYSQL_FTPARSER_PARAM; 199 200 /* 201 Full-text parser descriptor. 202 203 interface_version is, e.g., MYSQL_FTPARSER_INTERFACE_VERSION. 204 The parsing, initialization, and deinitialization functions are 205 invoked per SQL statement for which the parser is used. 206 */ 207 208 struct st_mysql_ftparser 209 { 210 int interface_version; 211 int (*parse)(MYSQL_FTPARSER_PARAM *param); 212 int (*init)(MYSQL_FTPARSER_PARAM *param); 213 int (*deinit)(MYSQL_FTPARSER_PARAM *param); 214 }; 215 216 217 #ifdef __cplusplus 218 } 219 #endif 220 221 #endif 222 223