1/*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements.  See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License.  You may obtain a copy of the License at
8 *
9 *    http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18package org.apache.spark.sql.catalyst.expressions
19
20import org.apache.spark.SparkFunSuite
21import org.apache.spark.sql.AnalysisException
22import org.apache.spark.sql.catalyst.dsl.expressions._
23import org.apache.spark.sql.types.{IntegerType, StringType}
24
25/**
26 * Unit tests for regular expression (regexp) related SQL expressions.
27 */
class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {

  /**
   * Checks that an expression evaluates to the expected output in two ways: once with the
   * input embedded directly in the expression (literal input), and once with the input read
   * from column 0 of a single-column row (row input).
   *
   * The row-based check reads the input through a string-typed bound reference
   * (`'a.string.at(0)`), so the helper is meant for string inputs. Every caller in this suite
   * passes the pattern / regular expression operand as `input`.
   *
   * @tparam A type of input
   * @param mkExpr builds the expression under test from the input expression
   * @param input value that will be used to create the expression, as literal and in the form
   *        of a row
   * @param expected the expected output of the expression
   * @param inputToExpression an implicit conversion from the input type to its corresponding
   *        sql expression
   */
  def checkLiteralRow[A](mkExpr: Expression => Expression, input: A, expected: Any)
    (implicit inputToExpression: A => Expression): Unit = {
    checkEvaluation(mkExpr(input), expected) // check literal input

    // Same expression, but the input now comes from column 0 of the row built below.
    val regex = 'a.string.at(0)
    checkEvaluation(mkExpr(regex), expected, create_row(input)) // check row input
  }

  // Covers LIKE: null propagation, `_` / `%` wildcards, backslash escaping, unicode,
  // rejection of invalid escape sequences and case sensitivity.
  test("LIKE Pattern") {

    // null handling: a null on either side yields null, for foldable and non-foldable patterns
    checkLiteralRow(Literal.create(null, StringType).like(_), "a", null)
    checkEvaluation(Literal.create("a", StringType).like(Literal.create(null, StringType)), null)
    checkEvaluation(Literal.create(null, StringType).like(Literal.create(null, StringType)), null)
    checkEvaluation(
      Literal.create("a", StringType).like(NonFoldableLiteral.create("a", StringType)), true)
    checkEvaluation(
      Literal.create("a", StringType).like(NonFoldableLiteral.create(null, StringType)), null)
    checkEvaluation(
      Literal.create(null, StringType).like(NonFoldableLiteral.create("a", StringType)), null)
    checkEvaluation(
      Literal.create(null, StringType).like(NonFoldableLiteral.create(null, StringType)), null)

    // simple patterns
    checkLiteralRow("abdef" like _, "abdef", true)
    checkLiteralRow("a_%b" like _, "a\\__b", true)
    checkLiteralRow("addb" like _, "a_%b", true)
    checkLiteralRow("addb" like _, "a\\__b", false)
    checkLiteralRow("addb" like _, "a%\\%b", false)
    checkLiteralRow("a_%b" like _, "a%\\%b", true)
    checkLiteralRow("addb" like _, "a%", true)
    // "**" is an ordinary two-character string for LIKE, unlike for RLIKE where it is rejected
    checkLiteralRow("addb" like _, "**", false)
    checkLiteralRow("abc" like _, "a%", true)
    checkLiteralRow("abc" like _, "b%", false)
    checkLiteralRow("abc" like _, "bc%", false)
    // both wildcards are expected to match a newline character
    checkLiteralRow("a\nb" like _, "a_b", true)
    checkLiteralRow("ab" like _, "a%b", true)
    checkLiteralRow("a\nb" like _, "a%b", true)

    // empty input
    checkLiteralRow("" like _, "", true)
    checkLiteralRow("a" like _, "", false)
    checkLiteralRow("" like _, "a", false)

    // SPARK-17647 double-escaping backslash
    checkLiteralRow("""\\\\""" like _, """%\\%""", true)
    checkLiteralRow("""%%""" like _, """%%""", true)
    checkLiteralRow("""\__""" like _, """\\\__""", true)
    checkLiteralRow("""\\\__""" like _, """%\\%\%""", false)
    checkLiteralRow("""_\\\%""" like _, """%\\""", false)

    // unicode: the escaped and the literal spelling of the euro sign must be interchangeable
    // scalastyle:off nonascii
    checkLiteralRow("a\u20ACa" like _, "_\u20AC_", true)
    checkLiteralRow("a€a" like _, "_€_", true)
    checkLiteralRow("a€a" like _, "_\u20AC_", true)
    checkLiteralRow("a\u20ACa" like _, "_€_", true)
    // scalastyle:on nonascii

    // invalid escaping: a backslash followed by an ordinary character is rejected, and the
    // error message is expected to mention "pattern"
    val invalidEscape = intercept[AnalysisException] {
      evaluate("""a""" like """\a""")
    }
    assert(invalidEscape.getMessage.contains("pattern"))

    // a pattern ending in a dangling backslash is rejected in the same way
    val endEscape = intercept[AnalysisException] {
      evaluate("""a""" like """a\""")
    }
    assert(endEscape.getMessage.contains("pattern"))

    // case: matching is case sensitive
    checkLiteralRow("A" like _, "a%", false)
    checkLiteralRow("a" like _, "A%", false)
    checkLiteralRow("AaA" like _, "_a_", true)

    // example
    checkLiteralRow("""%SystemDrive%\Users\John""" like _, """\%SystemDrive\%\\Users%""", true)
  }

  // Covers RLIKE: null propagation, unanchored and anchored matching, and rejection of
  // syntactically invalid regular expressions.
  test("RLIKE Regular Expression") {
    // null handling: a null on either side yields null, for foldable and non-foldable patterns
    checkLiteralRow(Literal.create(null, StringType) rlike _, "abdef", null)
    checkEvaluation("abdef" rlike Literal.create(null, StringType), null)
    checkEvaluation(Literal.create(null, StringType) rlike Literal.create(null, StringType), null)
    checkEvaluation("abdef" rlike NonFoldableLiteral.create("abdef", StringType), true)
    checkEvaluation("abdef" rlike NonFoldableLiteral.create(null, StringType), null)
    checkEvaluation(
      Literal.create(null, StringType) rlike NonFoldableLiteral.create("abdef", StringType), null)
    checkEvaluation(
      Literal.create(null, StringType) rlike NonFoldableLiteral.create(null, StringType), null)

    checkLiteralRow("abdef" rlike _, "abdef", true)
    checkLiteralRow("abbbbc" rlike _, "a.*c", true)

    // a match anywhere in the input is enough unless the regex is anchored
    checkLiteralRow("fofo" rlike _, "^fo", true)
    checkLiteralRow("fo\no" rlike _, "^fo\no$", true)
    checkLiteralRow("Bn" rlike _, "^Ba*n", true)
    checkLiteralRow("afofo" rlike _, "fo", true)
    checkLiteralRow("afofo" rlike _, "^fo", false)
    checkLiteralRow("Baan" rlike _, "^Ba?n", false)
    checkLiteralRow("axe" rlike _, "pi|apa", false)
    checkLiteralRow("pip" rlike _, "^(pi)*$", false)

    checkLiteralRow("abc" rlike _, "^ab", true)
    checkLiteralRow("abc" rlike _, "^bc", false)

    // an invalid regex must fail both as a literal pattern and as a pattern read from a row
    intercept[java.util.regex.PatternSyntaxException] {
      evaluate("abbbbc" rlike "**")
    }
    intercept[java.util.regex.PatternSyntaxException] {
      val regex = 'a.string.at(0)
      evaluate("abbbbc" rlike regex, create_row("**"))
    }
  }

  // Covers RegExpReplace with subject, pattern and replacement all read from the input row,
  // including a null in each of the three positions.
  test("RegexReplace") {
    // columns: (subject, pattern, replacement)
    val row1 = create_row("100-200", "(\\d+)", "num")
    val row2 = create_row("100-200", "(\\d+)", "###")
    val row3 = create_row("100-200", "(-)", "###")
    val row4 = create_row(null, "(\\d+)", "###")
    val row5 = create_row("100-200", null, "###")
    val row6 = create_row("100-200", "(-)", null)

    val s = 's.string.at(0)
    val p = 'p.string.at(1)
    val r = 'r.string.at(2)

    val expr = RegExpReplace(s, p, r)
    checkEvaluation(expr, "num-num", row1)
    checkEvaluation(expr, "###-###", row2)
    checkEvaluation(expr, "100###200", row3)
    // a null subject, pattern or replacement makes the result null
    checkEvaluation(expr, null, row4)
    checkEvaluation(expr, null, row5)
    checkEvaluation(expr, null, row6)

    // all-literal arguments: the expression contains no reference to the row passed in
    val nonNullExpr = RegExpReplace(Literal("100-200"), Literal("(\\d+)"), Literal("num"))
    checkEvaluation(nonNullExpr, "num-num", row1)
  }

  // Covers RegExpExtract with subject, pattern and group index all read from the input row,
  // including a null in each of the three positions.
  test("RegexExtract") {
    // columns: (subject, pattern, group index)
    val row1 = create_row("100-200", "(\\d+)-(\\d+)", 1)
    val row2 = create_row("100-200", "(\\d+)-(\\d+)", 2)
    val row3 = create_row("100-200", "(\\d+).*", 1)
    val row4 = create_row("100-200", "([a-z])", 1)
    val row5 = create_row(null, "([a-z])", 1)
    val row6 = create_row("100-200", null, 1)
    val row7 = create_row("100-200", "([a-z])", null)

    val s = 's.string.at(0)
    val p = 'p.string.at(1)
    val r = 'r.int.at(2)

    val expr = RegExpExtract(s, p, r)
    checkEvaluation(expr, "100", row1)
    checkEvaluation(expr, "200", row2)
    checkEvaluation(expr, "100", row3)
    checkEvaluation(expr, "", row4) // the pattern matches nothing, so an empty string is returned
    // a null subject, pattern or group index makes the result null
    checkEvaluation(expr, null, row5)
    checkEvaluation(expr, null, row6)
    checkEvaluation(expr, null, row7)

    // two-argument constructor: expected to give the same result as group index 1 on row1
    val expr1 = new RegExpExtract(s, p)
    checkEvaluation(expr1, "100", row1)

    // all-literal arguments: the expression contains no reference to the row passed in
    val nonNullExpr = RegExpExtract(Literal("100-200"), Literal("(\\d+)-(\\d+)"), Literal(1))
    checkEvaluation(nonNullExpr, "100", row1)
  }

  // Covers StringSplit with a regex delimiter, for literal and row-based arguments, and with
  // a null in either position.
  test("SPLIT") {
    val s1 = 'a.string.at(0)
    val s2 = 'b.string.at(1)
    // columns: (string to split, delimiter regex)
    val row1 = create_row("aa2bb3cc", "[1-9]+")
    val row2 = create_row(null, "[1-9]+")
    val row3 = create_row("aa2bb3cc", null)

    checkEvaluation(
      StringSplit(Literal("aa2bb3cc"), Literal("[1-9]+")), Seq("aa", "bb", "cc"), row1)
    checkEvaluation(
      StringSplit(s1, s2), Seq("aa", "bb", "cc"), row1)
    // a null string or a null delimiter makes the result null
    checkEvaluation(StringSplit(s1, s2), null, row2)
    checkEvaluation(StringSplit(s1, s2), null, row3)
  }

}
226