1<?php
2/*
3 *  $Id$
4 *
5 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
6 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
7 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
8 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
9 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
10 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
11 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
12 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
13 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
14 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
15 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
16 *
17 * This software consists of voluntary contributions made by many individuals
18 * and is licensed under the LGPL. For more information, see
19 * <http://www.doctrine-project.org>.
20 */
21
22/**
23 * Doctrine_Query_Tokenizer
24 *
25 * @package     Doctrine
26 * @subpackage  Query
27 * @license     http://www.opensource.org/licenses/lgpl-license.php LGPL
28 * @link        www.doctrine-project.org
29 * @since       1.0
30 * @version     $Revision$
31 * @author      Konsta Vesterinen <kvesteri@cc.hut.fi>
32 * @author      Guilherme Blanco <guilhermeblanco@hotmail.com>
33 * @author      Stefan Klug <stefan.klug@googlemail.com>
34 */
class Doctrine_Query_Tokenizer
{

    /**
     * Splits the given DQL query into its top-level clauses.
     *
     * The query is exploded on whitespace (respecting brackets and quoted
     * strings, see sqlExplode()). Every clause keyword (DELETE, UPDATE, SELECT,
     * SET, FROM, WHERE, LIMIT, OFFSET, HAVING, and ORDER / GROUP when directly
     * followed by BY) opens a new entry; all following tokens are appended to
     * that entry, each followed by a single space. The token BY itself is
     * always dropped.
     *
     * Keys keep the case used in the query, and values are strings (not
     * arrays) that end with a trailing space.
     *
     * example:
     *
     * parameter:
     *     $query = "SELECT u.* FROM User u WHERE u.name LIKE ?"
     * returns:
     *     array(
     *         'SELECT' => 'u.* ',
     *         'FROM'   => 'User u ',
     *         'WHERE'  => 'u.name LIKE ? '
     *     );
     *
     * @param string $query                       DQL query
     *
     * @throws Doctrine_Query_Tokenizer_Exception If a token is found before any
     *                                            clause keyword has been seen
     *
     * @return array                              Clause keyword => clause body
     */
    public function tokenizeQuery($query)
    {
        // Keywords that open a clause on their own (flipped for isset lookup).
        $clauseKeywords = array(
            'delete' => true,
            'update' => true,
            'select' => true,
            'set'    => true,
            'from'   => true,
            'where'  => true,
            'limit'  => true,
            'offset' => true,
            'having' => true,
        );

        $tokens  = $this->sqlExplode($query, ' ');
        $parts   = array();
        $current = null; // key of the clause currently being filled

        foreach ($tokens as $index => $token) {
            $token   = trim($token);
            $keyword = strtolower($token);

            // "BY" only ever belongs to ORDER BY / GROUP BY and carries no data.
            if ($keyword === 'by') {
                continue;
            }

            $opensClause = isset($clauseKeywords[$keyword]);

            // ORDER and GROUP are keywords only when directly followed by BY;
            // otherwise they are ordinary tokens (e.g. an identifier).
            if ( ! $opensClause && ($keyword === 'order' || $keyword === 'group')) {
                $next = $index + 1;
                $opensClause = isset($tokens[$next]) && strtolower($tokens[$next]) === 'by';
            }

            if ($opensClause) {
                $current = $token;
                $parts[$token] = '';
                continue;
            }

            // Ordinary token: it needs an open clause to be appended to. This
            // also covers a leading ORDER / GROUP without BY, which previously
            // fell through to an undefined-variable access.
            if ($current === null) {
                throw new Doctrine_Query_Tokenizer_Exception(
                    "Couldn't tokenize query. Encountered invalid token: '$token'."
                );
            }

            $parts[$current] .= "$token ";
        }

        return $parts;
    }

    /**
     * Removes one pair of enclosing brackets from a string.
     *
     * The string is only changed when its first character equals $e1 AND its
     * last character equals $e2; otherwise it is returned untouched.
     *
     * @param string $str String to remove the brackets from
     * @param string $e1  Opening bracket (single character), usually '('
     * @param string $e2  Closing bracket (single character), usually ')'
     *
     * @return string
     */
    public function bracketTrim($str, $e1 = '(', $e2 = ')')
    {
        $isWrapped = substr($str, 0, 1) === $e1 && substr($str, -1) === $e2;

        return $isWrapped ? substr($str, 1, -1) : $str;
    }

    /**
     * Explodes a sql expression respecting bracket placement.
     *
     * This method transforms a sql expression into an array of simple clauses,
     * while observing the parentheses precedence. Delimiters are matched
     * case-insensitively and are not part of the result.
     *
     * Note: bracketExplode always trims the returned pieces
     *
     * <code>
     * $str = (age < 20 AND age > 18) AND email LIKE 'John@example.com'
     * $clauses = $tokenizer->bracketExplode($str, ' AND ', '(', ')');
     * // array("(age < 20 AND age > 18)", "email LIKE 'John@example.com'")
     * </code>
     *
     * @param string       $str String to be bracket exploded
     * @param string|array $d   Delimiter (or list of delimiters) which explodes the string
     * @param string       $e1  First bracket, usually '('
     * @param string       $e2  Second bracket, usually ')'
     *
     * @return array
     */
    public function bracketExplode($str, $d = ' ', $e1 = '(', $e2 = ')')
    {
        if (is_string($d)) {
            $d = array($d);
        }

        // Bracket explode has to be case insensitive
        $regexp = $this->getSplitRegExpFromArray($d) . 'i';

        // Trim is here for historical reasons
        return $this->extractTrimmedTerms(
            $this->clauseExplodeRegExp($str, $regexp, $e1, $e2)
        );
    }

    /**
     * Explodes a string by a delimiter without splitting inside quoted strings.
     *
     * Delimiters are matched case-insensitively and are not part of the
     * result. Unlike bracketExplode(), terms inside brackets are NOT merged.
     *
     * Note: quoteExplode always trims the returned pieces
     *
     * example:
     *
     * parameters:
     *     $str = email LIKE 'John@example.com'
     *     $d = ' LIKE '
     *
     * would return an array:
     *     array("email", "'John@example.com'")
     *
     * @param string       $str String to be quote exploded
     * @param string|array $d   Delimiter (or list of delimiters) which explodes the string
     *
     * @return array
     */
    public function quoteExplode($str, $d = ' ')
    {
        if (is_string($d)) {
            $d = array($d);
        }

        // According to the testcases quoteExplode is case insensitive
        $regexp = $this->getSplitRegExpFromArray($d) . 'i';

        return $this->extractTrimmedTerms(
            $this->clauseExplodeCountBrackets($str, $regexp)
        );
    }

    /**
     * Explodes a string into array using custom brackets and
     * quote delimiters. Delimiters are matched case-sensitively.
     *
     * Note: sqlExplode trims all returned parts. Quoted strings keep their
     * surrounding quotes.
     *
     * example:
     *
     * parameters:
     *     $str = "(age < 20 AND age > 18) AND name LIKE 'John Doe'"
     *     $d   = ' '
     *     $e1  = '('
     *     $e2  = ')'
     *
     * would return an array:
     *     array(
     *         '(age < 20 AND age > 18)',
     *         'AND',
     *         'name',
     *         'LIKE',
     *         "'John Doe'"
     *     );
     *
     * @param string       $str String to be SQL exploded
     * @param string|array $d   Delimiter (or list of delimiters) which explodes the string
     * @param string       $e1  First bracket, usually '('
     * @param string       $e2  Second bracket, usually ')'
     *
     * @return array
     */
    public function sqlExplode($str, $d = ' ', $e1 = '(', $e2 = ')')
    {
        if (is_string($d)) {
            $d = array($d);
        }

        return $this->extractTrimmedTerms(
            $this->clauseExplode($str, $d, $e1, $e2)
        );
    }

    /**
     * Explodes a string into array using custom brackets and quote delimiters.
     * Each array element is an array of length 2 where the first entry contains
     * the term, and the second entry contains the delimiter that followed it
     * ('' for the last term). Terms are NOT trimmed and quoted strings keep
     * their quotes.
     *
     * example:
     *
     * parameters:
     *     $str = "(age < 20 AND age > 18) AND name LIKE 'John'+' Doe'"
     *     $d   = array(' ', '+')
     *     $e1  = '('
     *     $e2  = ')'
     *
     * would return an array:
     *     array(
     *         array('(age < 20 AND age > 18)', ' '),
     *         array('AND',  ' '),
     *         array('name', ' '),
     *         array('LIKE', ' '),
     *         array("'John'", '+'),
     *         array("' Doe'", '')
     *     );
     *
     * @param string $str String to be clause exploded
     * @param array  $d   Delimiters which explode the string
     * @param string $e1  First bracket, usually '('
     * @param string $e2  Second bracket, usually ')'
     *
     * @return array
     */
    public function clauseExplode($str, array $d, $e1 = '(', $e2 = ')')
    {
        return $this->clauseExplodeRegExp(
            $str,
            $this->getSplitRegExpFromArray($d),
            $e1,
            $e2
        );
    }

    /**
     * Reduces a list of array(term, delimiter[, ...]) entries to the list of
     * trimmed terms.
     *
     * @param array $terms Entries as produced by the clauseExplode* methods
     *
     * @return array List of trimmed term strings, in the same order
     */
    private function extractTrimmedTerms(array $terms)
    {
        $res = array();

        foreach ($terms as $term) {
            $res[] = trim($term[0]);
        }

        return $res;
    }

    /**
     * Builds the regular expression used to split a string on any of the
     * given delimiters. The whole alternation is one capturing group so the
     * matched delimiter can be returned by preg_split().
     *
     * - Every delimiter is escaped for use inside a '#'-delimited pattern.
     * - A delimiter consisting only of word characters (e.g. 'AND') is wrapped
     *   in \W...\W, so it only matches when surrounded by non-word characters;
     *   those surrounding characters become part of the matched delimiter.
     * - If a single space is one of the delimiters, any whitespace character
     *   splits as well.
     *
     * @param array $d List of delimiter strings
     *
     * @return string The pattern, without modifiers (callers may append them)
     */
    private function getSplitRegExpFromArray(array $d)
    {
        foreach ($d as $key => $string) {
            // Pass the pattern delimiter explicitly: PHP < 7.3 does not escape
            // '#' by default, which would break the pattern built below.
            $escapedString = preg_quote($string, '#');
            if (preg_match('#^\w+$#', $string)) {
                $escapedString = "\W$escapedString\W";
            }
            $d[$key] = $escapedString;
        }

        // A space is not changed by preg_quote(), so it can still be found here.
        if (in_array(' ', $d, true)) {
            $d[] = '\s';
        }

        return '#(' . implode('|', $d) . ')#';
    }

    /**
     * Same as clauseExplode, but takes a ready-made split pattern instead of
     * a list of delimiters.
     *
     * @param string $str    String to be exploded
     * @param string $regexp Split pattern with the delimiter in one capturing group
     * @param string $e1     First bracket, usually '('
     * @param string $e2     Second bracket, usually ')'
     *
     * @return array List of array(term, delimiter)
     */
    private function clauseExplodeRegExp($str, $regexp, $e1 = '(', $e2 = ')')
    {
        $terms = $this->clauseExplodeCountBrackets($str, $regexp, $e1, $e2);
        $terms = $this->mergeBracketTerms($terms);

        // Drop the internal bracket counter; this is only here to comply with
        // the old function signature. Building a fresh list (instead of a
        // by-reference loop) avoids leaving a reference in the returned array.
        $res = array();
        foreach ($terms as $term) {
            $res[] = array($term[0], $term[1]);
        }

        return $res;
    }

    /**
     * Like clauseExplodeRegExp, but it doesn't merge bracket terms. Instead
     * every returned term carries its bracket balance as third entry.
     *
     * Quoted strings are never split and their content is not counted towards
     * the bracket balance; a quoted string is glued to the term it touches.
     *
     * @param string $str    String to be exploded
     * @param string $regexp Split pattern with the delimiter in one capturing group
     * @param string $e1     First bracket, usually '('
     * @param string $e2     Second bracket, usually ')'
     *
     * @return array List of array(term, delimiter, bracket balance)
     */
    private function clauseExplodeCountBrackets($str, $regexp, $e1 = '(', $e2 = ')')
    {
        $quoteTerms = $this->quotedStringExplode($str);
        $terms = array();

        foreach ($quoteTerms as $key => $val) {
            // $terms is always a 0-based list, so this is the previous term.
            $last = count($terms) - 1;
            $lastIsOpen = $last >= 0 && $terms[$last][1] === '';

            if ($key & 1) { // a quoted string
                // If the last term had no ending delimiter, we append the
                // string to it, otherwise we create a new element without
                // delimiter. Brackets inside the quotes are not counted.
                if ($lastIsOpen) {
                    $terms[$last][0] .= $val;
                } else {
                    $terms[] = array($val, '', 0);
                }

                continue;
            }

            // Not a quoted string: do the clause explode and count brackets
            $subterms = $this->clauseExplodeNonQuoted($val, $regexp);

            foreach ($subterms as $idx => $sub) {
                $subterms[$idx][2] = substr_count($sub[0], $e1) - substr_count($sub[0], $e2);
            }

            // If the previous term had no delimiter, the first new term is
            // the continuation of it: merge them
            if ($lastIsOpen) {
                $first = array_shift($subterms);

                $terms[$last][0] .= $first[0];
                $terms[$last][1] = $first[1];
                $terms[$last][2] += $first[2];
            }

            $terms = array_merge($terms, $subterms);
        }

        return $terms;
    }

    /**
     * Explodes a string by the given split pattern. This function doesn't
     * respect quoted strings or brackets.
     *
     * The returned array contains an array per term with the following
     * elements:
     * [0] = the term itself
     * [1] = the delimiter splitting this term from the next ('' for the last)
     *
     * example:
     *
     * parameters:
     *     $str    = "a (b '(c+d))'"
     *     $regexp = pattern built from array(' ', '+')
     *
     * returns:
     *     array(
     *        array('a', ' '),
     *        array('(b', ' '),
     *        array("'(c", '+'),
     *        array("d))'", '')
     *     );
     *
     * @param string $str    String to be exploded
     * @param string $regexp Split pattern with the delimiter in one capturing group
     *
     * @return array
     */
    private function clauseExplodeNonQuoted($str, $regexp)
    {
        $pieces = preg_split($regexp, $str, -1, PREG_SPLIT_DELIM_CAPTURE);
        $terms = array();

        // With one capturing group the result alternates term, delimiter,
        // term, ... and always ends on a term, so the last pair has no
        // delimiter.
        foreach (array_chunk($pieces, 2) as $pair) {
            $terms[] = array($pair[0], isset($pair[1]) ? $pair[1] : '');
        }

        return $terms;
    }

    /**
     * This expects input from clauseExplodeCountBrackets.
     * It goes through the terms and merges consecutive terms until their
     * summed bracket balance is zero, re-inserting the delimiters between
     * the merged pieces.
     * Note that only the third entry of each term is used to get the
     * bracket overhang. This is needed to be able to handle quoted strings
     * which contain brackets.
     *
     * example:
     *
     * parameters:
     *     $terms = array(
     *         array("'a(b'", '+', 0)
     *         array('(2', '+', 1),
     *         array('3)', '-', -1),
     *         array('5', '' , 0)
     *     );
     *
     * would return:
     *     array(
     *         array("'a(b'", '+', 0),
     *         array('(2+3)', '-', 0),
     *         array('5'    , '' , 0)
     *     );
     *
     * @param array $terms List of array(term, delimiter, bracket balance)
     *
     * @return array Same structure, with unbalanced terms merged
     */
    private function mergeBracketTerms(array $terms)
    {
        $res = array();
        $pending = null; // term still waiting for its brackets to balance

        foreach ($terms as $val) {
            if ($pending === null) {
                $pending = array($val[0], $val[1], $val[2]);
            } else {
                $pending[0] .= $pending[1] . $val[0];
                $pending[1] = $val[1];
                $pending[2] += $val[2];
            }

            // No bracket overhang left: the term is complete
            if ($pending[2] === 0) {
                $res[] = $pending;
                $pending = null;
            }
        }

        // Unbalanced input: keep whatever has been collected as the last term
        if ($pending !== null) {
            $res[] = $pending;
        }

        return $res;
    }


    /**
     * Explodes the given string by <quoted words>
     *
     * Both ' and " open a quoted string; it ends at the next occurrence of
     * the same quote character. Escaped quotes (\' and \") and doubled quotes
     * ('' and "") never open or close a string and are copied through.
     *
     * example:
     *
     * parameters:
     *     $str ="'a' AND name = 'John O\'Connor'"
     *
     * returns
     *     array("", "'a'", " AND name = ", "'John O\'Connor'", "")
     *
     * Note the leading and trailing empty strings. In the result, the
     * elements at odd indexes (1, 3, ...) are the quoted strings, including
     * their quotes; the elements at even indexes are the unquoted text.
     *
     * @param string $str the string to split
     *
     * @return array
     */
    public function quotedStringExplode($str)
    {
        // Split by all possible incarnations of a quote. Order matters: the
        // two-character forms have to be tried before the plain quote.
        $quotes = array("\\'", "''", "'", "\\\"", "\"\"", "\"");
        $pattern = '#(' . implode('|', array_map('preg_quote', $quotes)) . ')#';
        $pieces = preg_split($pattern, $str, -1, PREG_SPLIT_DELIM_CAPTURE);

        $parts = array();
        $mode = false; // Mode is either ' or " if the loop is inside a string quoted with ' or "
        $i = 0;

        foreach ($pieces as $key => $val) {
            // Odd entries are some kind of quote
            if ($key & 1) {
                if ($mode === false) {
                    // Only a plain quote opens a string; start a new part
                    if ($val === "'" || $val === "\"") {
                        $mode = $val;
                        $i++;
                    }
                } elseif ($mode === $val) {
                    // Matching closing quote: finish the quoted part
                    $parts[$i] = (isset($parts[$i]) ? $parts[$i] : '') . $val;

                    $mode = false;
                    $i++;

                    continue;
                }
            }

            $parts[$i] = (isset($parts[$i]) ? $parts[$i] : '') . $val;
        }

        return $parts;
    }
}
559