/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.types.{IntegerType, StringType}

/**
 * Unit tests for regular expression (regexp) related SQL expressions.
 */
class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {

  /**
   * Check if a given expression evaluates to an expected output, both when the input is
   * supplied as a literal and when it is read from a row.
   *
   * @tparam A type of input
   * @param mkExpr function that builds the expression under test from the input expression
   * @param input value that will be used to create the expression, as literal and in the form
   *              of a row
   * @param expected the expected output of the expression
   * @param inputToExpression an implicit conversion from the input type to its corresponding
   *                          sql expression
   */
  def checkLiteralRow[A](mkExpr: Expression => Expression, input: A, expected: Any)
    (implicit inputToExpression: A => Expression): Unit = {
    checkEvaluation(mkExpr(input), expected) // check literal input

    // String attribute at ordinal 0, i.e. the single field of the row created below.
    val rowInput = 'a.string.at(0)
    checkEvaluation(mkExpr(rowInput), expected, create_row(input)) // check row input
  }

  // LIKE: null propagation, wildcards (`_`, `%`), escaping rules and case sensitivity.
  test("LIKE Pattern") {

    // null handling
    checkLiteralRow(Literal.create(null, StringType).like(_), "a", null)
    checkEvaluation(Literal.create("a", StringType).like(Literal.create(null, StringType)), null)
    checkEvaluation(Literal.create(null, StringType).like(Literal.create(null, StringType)), null)
    checkEvaluation(
      Literal.create("a", StringType).like(NonFoldableLiteral.create("a", StringType)), true)
    checkEvaluation(
      Literal.create("a", StringType).like(NonFoldableLiteral.create(null, StringType)), null)
    checkEvaluation(
      Literal.create(null, StringType).like(NonFoldableLiteral.create("a", StringType)), null)
    checkEvaluation(
      Literal.create(null, StringType).like(NonFoldableLiteral.create(null, StringType)), null)

    // simple patterns
    checkLiteralRow("abdef" like _, "abdef", true)
    checkLiteralRow("a_%b" like _, "a\\__b", true)
    checkLiteralRow("addb" like _, "a_%b", true)
    checkLiteralRow("addb" like _, "a\\__b", false)
    checkLiteralRow("addb" like _, "a%\\%b", false)
    checkLiteralRow("a_%b" like _, "a%\\%b", true)
    checkLiteralRow("addb" like _, "a%", true)
    checkLiteralRow("addb" like _, "**", false)
    checkLiteralRow("abc" like _, "a%", true)
    checkLiteralRow("abc" like _, "b%", false)
    checkLiteralRow("abc" like _, "bc%", false)
    checkLiteralRow("a\nb" like _, "a_b", true)
    checkLiteralRow("ab" like _, "a%b", true)
    checkLiteralRow("a\nb" like _, "a%b", true)

    // empty input
    checkLiteralRow("" like _, "", true)
    checkLiteralRow("a" like _, "", false)
    checkLiteralRow("" like _, "a", false)

    // SPARK-17647 double-escaping backslash
    checkLiteralRow("""\\\\""" like _, """%\\%""", true)
    checkLiteralRow("""%%""" like _, """%%""", true)
    checkLiteralRow("""\__""" like _, """\\\__""", true)
    checkLiteralRow("""\\\__""" like _, """%\\%\%""", false)
    checkLiteralRow("""_\\\%""" like _, """%\\""", false)

    // unicode
    // scalastyle:off nonascii
    checkLiteralRow("a\u20ACa" like _, "_\u20AC_", true)
    checkLiteralRow("a€a" like _, "_€_", true)
    checkLiteralRow("a€a" like _, "_\u20AC_", true)
    checkLiteralRow("a\u20ACa" like _, "_€_", true)
    // scalastyle:on nonascii

    // invalid escaping: escaping an ordinary character
    val invalidEscape = intercept[AnalysisException] {
      evaluate("""a""" like """\a""")
    }
    assert(invalidEscape.getMessage.contains("pattern"))

    // invalid escaping: escape character at the very end of the pattern
    val endEscape = intercept[AnalysisException] {
      evaluate("""a""" like """a\""")
    }
    assert(endEscape.getMessage.contains("pattern"))

    // case
    checkLiteralRow("A" like _, "a%", false)
    checkLiteralRow("a" like _, "A%", false)
    checkLiteralRow("AaA" like _, "_a_", true)

    // example
    checkLiteralRow("""%SystemDrive%\Users\John""" like _, """\%SystemDrive\%\\Users%""", true)
  }

  // RLIKE: null propagation, partial (find-style) matching, anchors and invalid patterns.
  test("RLIKE Regular Expression") {
    // null handling
    checkLiteralRow(Literal.create(null, StringType) rlike _, "abdef", null)
    checkEvaluation("abdef" rlike Literal.create(null, StringType), null)
    checkEvaluation(Literal.create(null, StringType) rlike Literal.create(null, StringType), null)
    checkEvaluation("abdef" rlike NonFoldableLiteral.create("abdef", StringType), true)
    checkEvaluation("abdef" rlike NonFoldableLiteral.create(null, StringType), null)
    checkEvaluation(
      Literal.create(null, StringType) rlike NonFoldableLiteral.create("abdef", StringType), null)
    checkEvaluation(
      Literal.create(null, StringType) rlike NonFoldableLiteral.create(null, StringType), null)

    checkLiteralRow("abdef" rlike _, "abdef", true)
    checkLiteralRow("abbbbc" rlike _, "a.*c", true)

    checkLiteralRow("fofo" rlike _, "^fo", true)
    checkLiteralRow("fo\no" rlike _, "^fo\no$", true)
    checkLiteralRow("Bn" rlike _, "^Ba*n", true)
    checkLiteralRow("afofo" rlike _, "fo", true)
    checkLiteralRow("afofo" rlike _, "^fo", false)
    checkLiteralRow("Baan" rlike _, "^Ba?n", false)
    checkLiteralRow("axe" rlike _, "pi|apa", false)
    checkLiteralRow("pip" rlike _, "^(pi)*$", false)

    checkLiteralRow("abc" rlike _, "^ab", true)
    checkLiteralRow("abc" rlike _, "^bc", false)

    // A syntactically invalid regex must fail for a literal pattern ...
    intercept[java.util.regex.PatternSyntaxException] {
      evaluate("abbbbc" rlike "**")
    }
    // ... and for a pattern that is read from the input row.
    intercept[java.util.regex.PatternSyntaxException] {
      val regex = 'a.string.at(0)
      evaluate("abbbbc" rlike regex, create_row("**"))
    }
  }

  // RegExpReplace over (subject, pattern, replacement) rows; a null in any position gives null.
  test("RegexReplace") {
    val row1 = create_row("100-200", "(\\d+)", "num")
    val row2 = create_row("100-200", "(\\d+)", "###")
    val row3 = create_row("100-200", "(-)", "###")
    val row4 = create_row(null, "(\\d+)", "###")
    val row5 = create_row("100-200", null, "###")
    val row6 = create_row("100-200", "(-)", null)

    val s = 's.string.at(0)
    val p = 'p.string.at(1)
    val r = 'r.string.at(2)

    val expr = RegExpReplace(s, p, r)
    checkEvaluation(expr, "num-num", row1)
    checkEvaluation(expr, "###-###", row2)
    checkEvaluation(expr, "100###200", row3)
    checkEvaluation(expr, null, row4)
    checkEvaluation(expr, null, row5)
    checkEvaluation(expr, null, row6)

    // Same replacement expressed with literal arguments only.
    val nonNullExpr = RegExpReplace(Literal("100-200"), Literal("(\\d+)"), Literal("num"))
    checkEvaluation(nonNullExpr, "num-num", row1)
  }

  // RegExpExtract over (subject, pattern, group index) rows; a null in any position gives null.
  test("RegexExtract") {
    val row1 = create_row("100-200", "(\\d+)-(\\d+)", 1)
    val row2 = create_row("100-200", "(\\d+)-(\\d+)", 2)
    val row3 = create_row("100-200", "(\\d+).*", 1)
    val row4 = create_row("100-200", "([a-z])", 1)
    val row5 = create_row(null, "([a-z])", 1)
    val row6 = create_row("100-200", null, 1)
    val row7 = create_row("100-200", "([a-z])", null)

    val s = 's.string.at(0)
    val p = 'p.string.at(1)
    val r = 'r.int.at(2)

    val expr = RegExpExtract(s, p, r)
    checkEvaluation(expr, "100", row1)
    checkEvaluation(expr, "200", row2)
    checkEvaluation(expr, "100", row3)
    checkEvaluation(expr, "", row4) // the pattern does not match, so an empty string is expected
    checkEvaluation(expr, null, row5)
    checkEvaluation(expr, null, row6)
    checkEvaluation(expr, null, row7)

    // Two-argument constructor: for row1 the result equals what group index 1 yields above.
    val expr1 = new RegExpExtract(s, p)
    checkEvaluation(expr1, "100", row1)

    // Same extraction expressed with literal arguments only.
    val nonNullExpr = RegExpExtract(Literal("100-200"), Literal("(\\d+)-(\\d+)"), Literal(1))
    checkEvaluation(nonNullExpr, "100", row1)
  }

  // StringSplit with a regex delimiter; a null subject or a null pattern gives null.
  test("SPLIT") {
    val s1 = 'a.string.at(0)
    val s2 = 'b.string.at(1)
    val row1 = create_row("aa2bb3cc", "[1-9]+")
    val row2 = create_row(null, "[1-9]+")
    val row3 = create_row("aa2bb3cc", null)

    checkEvaluation(
      StringSplit(Literal("aa2bb3cc"), Literal("[1-9]+")), Seq("aa", "bb", "cc"), row1)
    checkEvaluation(
      StringSplit(s1, s2), Seq("aa", "bb", "cc"), row1)
    checkEvaluation(StringSplit(s1, s2), null, row2)
    checkEvaluation(StringSplit(s1, s2), null, row3)
  }

}