1# 2014 Dec 19 2# 3# The author disclaims copyright to this source code. In place of 4# a legal notice, here is a blessing: 5# 6# May you do good and not evil. 7# May you find forgiveness for yourself and forgive others. 8# May you share freely, never taking more than you give. 9# 10#*********************************************************************** 11# 12 13if {![info exists testdir]} { 14 set testdir [file join [file dirname [info script]] .. .. .. test] 15} 16source $testdir/tester.tcl 17 18ifcapable !fts5 { 19 proc return_if_no_fts5 {} { 20 finish_test 21 return -code return 22 } 23 return 24} else { 25 proc return_if_no_fts5 {} {} 26} 27 28catch { 29 sqlite3_fts5_may_be_corrupt 0 30 reset_db 31} 32 33proc fts5_test_poslist {cmd} { 34 set res [list] 35 for {set i 0} {$i < [$cmd xInstCount]} {incr i} { 36 lappend res [string map {{ } .} [$cmd xInst $i]] 37 } 38 set res 39} 40 41proc fts5_test_poslist2 {cmd} { 42 set res [list] 43 44 for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} { 45 $cmd xPhraseForeach $i c o { 46 lappend res $i.$c.$o 47 } 48 } 49 50 #set res 51 sort_poslist $res 52} 53 54proc fts5_test_collist {cmd} { 55 set res [list] 56 57 for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} { 58 $cmd xPhraseColumnForeach $i c { lappend res $i.$c } 59 } 60 61 set res 62} 63 64proc fts5_test_columnsize {cmd} { 65 set res [list] 66 for {set i 0} {$i < [$cmd xColumnCount]} {incr i} { 67 lappend res [$cmd xColumnSize $i] 68 } 69 set res 70} 71 72proc fts5_test_columntext {cmd} { 73 set res [list] 74 for {set i 0} {$i < [$cmd xColumnCount]} {incr i} { 75 lappend res [$cmd xColumnText $i] 76 } 77 set res 78} 79 80proc fts5_test_columntotalsize {cmd} { 81 set res [list] 82 for {set i 0} {$i < [$cmd xColumnCount]} {incr i} { 83 lappend res [$cmd xColumnTotalSize $i] 84 } 85 set res 86} 87 88proc test_append_token {varname token iStart iEnd} { 89 upvar $varname var 90 lappend var $token 91 return "SQLITE_OK" 92} 93proc fts5_test_tokenize {cmd} { 94 set res [list] 95 for {set i 0} {$i < [$cmd xColumnCount]} {incr i} { 96 set tokens [list] 97 $cmd xTokenize [$cmd xColumnText $i] [list test_append_token tokens] 98 lappend res $tokens 99 } 100 set res 101} 102 103proc fts5_test_rowcount {cmd} { 104 $cmd xRowCount 105} 106 107proc test_queryphrase_cb {cnt cmd} { 108 upvar $cnt L 109 for {set i 0} {$i < [$cmd xInstCount]} {incr i} { 110 foreach {ip ic io} [$cmd xInst $i] break 111 set A($ic) 1 112 } 113 foreach ic [array names A] { 114 lset L $ic [expr {[lindex $L $ic] + 1}] 115 } 116} 117proc fts5_test_queryphrase {cmd} { 118 set res [list] 119 for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} { 120 set cnt [list] 121 for {set j 0} {$j < [$cmd xColumnCount]} {incr j} { lappend cnt 0 } 122 $cmd xQueryPhrase $i [list test_queryphrase_cb cnt] 123 lappend res $cnt 124 } 125 set res 126} 127 128proc fts5_test_phrasecount {cmd} { 129 $cmd xPhraseCount 130} 131 132proc fts5_test_all {cmd} { 133 set res [list] 134 lappend res columnsize [fts5_test_columnsize $cmd] 135 lappend res columntext [fts5_test_columntext $cmd] 136 lappend res columntotalsize [fts5_test_columntotalsize $cmd] 137 lappend res poslist [fts5_test_poslist $cmd] 138 lappend res tokenize [fts5_test_tokenize $cmd] 139 lappend res rowcount [fts5_test_rowcount $cmd] 140 set res 141} 142 143proc fts5_aux_test_functions {db} { 144 foreach f { 145 fts5_test_columnsize 146 fts5_test_columntext 147 fts5_test_columntotalsize 148 fts5_test_poslist 149 fts5_test_poslist2 150 fts5_test_collist 151 fts5_test_tokenize 152 fts5_test_rowcount 153 fts5_test_all 154 155 fts5_test_queryphrase 156 fts5_test_phrasecount 157 } { 158 sqlite3_fts5_create_function $db $f $f 159 } 160} 161 162proc fts5_segcount {tbl} { 163 set N 0 164 foreach n [fts5_level_segs $tbl] { incr N $n } 165 set N 166} 167 168proc fts5_level_segs {tbl} { 169 set sql "SELECT fts5_decode(rowid,block) aS r FROM ${tbl}_data WHERE rowid=10" 170 set ret [list] 171 foreach L [lrange [db one $sql] 1 end] { 172 lappend ret [expr [llength $L] - 3] 173 } 174 set ret 175} 176 177proc fts5_level_segids {tbl} { 178 set sql "SELECT fts5_decode(rowid,block) aS r FROM ${tbl}_data WHERE rowid=10" 179 set ret [list] 180 foreach L [lrange [db one $sql] 1 end] { 181 set lvl [list] 182 foreach S [lrange $L 3 end] { 183 regexp {id=([1234567890]*)} $S -> segid 184 lappend lvl $segid 185 } 186 lappend ret $lvl 187 } 188 set ret 189} 190 191proc fts5_rnddoc {n} { 192 set map [list 0 a 1 b 2 c 3 d 4 e 5 f 6 g 7 h 8 i 9 j] 193 set doc [list] 194 for {set i 0} {$i < $n} {incr i} { 195 lappend doc "x[string map $map [format %.3d [expr int(rand()*1000)]]]" 196 } 197 set doc 198} 199 200#------------------------------------------------------------------------- 201# Usage: 202# 203# nearset aCol ?-pc VARNAME? ?-near N? ?-col C? -- phrase1 phrase2... 204# 205# This command is used to test if a document (set of column values) matches 206# the logical equivalent of a single FTS5 NEAR() clump and, if so, return 207# the equivalent of an FTS5 position list. 208# 209# Parameter $aCol is passed a list of the column values for the document 210# to test. Parameters $phrase1 and so on are the phrases. 211# 212# The result is a list of phrase hits. Each phrase hit is formatted as 213# three integers separated by "." characters, in the following format: 214# 215# <phrase number> . <column number> . <token offset> 216# 217# Options: 218# 219# -near N (NEAR distance. Default 10) 220# -col C (List of column indexes to match against) 221# -pc VARNAME (variable in caller frame to use for phrase numbering) 222# -dict VARNAME (array in caller frame to use for synonyms) 223# 224proc nearset {aCol args} { 225 226 # Process the command line options. 227 # 228 set O(-near) 10 229 set O(-col) {} 230 set O(-pc) "" 231 set O(-dict) "" 232 233 set nOpt [lsearch -exact $args --] 234 if {$nOpt<0} { error "no -- option" } 235 236 # Set $lPhrase to be a list of phrases. $nPhrase its length. 237 set lPhrase [lrange $args [expr $nOpt+1] end] 238 set nPhrase [llength $lPhrase] 239 240 foreach {k v} [lrange $args 0 [expr $nOpt-1]] { 241 if {[info exists O($k)]==0} { error "unrecognized option $k" } 242 set O($k) $v 243 } 244 245 if {$O(-pc) == ""} { 246 set counter 0 247 } else { 248 upvar $O(-pc) counter 249 } 250 251 if {$O(-dict)!=""} { upvar $O(-dict) aDict } 252 253 for {set j 0} {$j < [llength $aCol]} {incr j} { 254 for {set i 0} {$i < $nPhrase} {incr i} { 255 set A($j,$i) [list] 256 } 257 } 258 259 # Loop through each column of the current row. 260 for {set iCol 0} {$iCol < [llength $aCol]} {incr iCol} { 261 262 # If there is a column filter, test whether this column is excluded. If 263 # so, skip to the next iteration of this loop. Otherwise, set zCol to the 264 # column value and nToken to the number of tokens that comprise it. 265 if {$O(-col)!="" && [lsearch $O(-col) $iCol]<0} continue 266 set zCol [lindex $aCol $iCol] 267 set nToken [llength $zCol] 268 269 # Each iteration of the following loop searches a substring of the 270 # column value for phrase matches. The last token of the substring 271 # is token $iLast of the column value. The first token is: 272 # 273 # iFirst = ($iLast - $O(-near) - 1) 274 # 275 # where $sz is the length of the phrase being searched for. A phrase 276 # counts as matching the substring if its first token lies on or before 277 # $iLast and its last token on or after $iFirst. 278 # 279 # For example, if the query is "NEAR(a+b c, 2)" and the column value: 280 # 281 # "x x x x A B x x C x" 282 # 0 1 2 3 4 5 6 7 8 9" 283 # 284 # when (iLast==8 && iFirst=5) the range will contain both phrases and 285 # so both instances can be added to the output poslists. 286 # 287 set iLast [expr $O(-near) >= $nToken ? $nToken - 1 : $O(-near)] 288 for { } {$iLast < $nToken} {incr iLast} { 289 290 catch { array unset B } 291 292 for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} { 293 set p [lindex $lPhrase $iPhrase] 294 set nPm1 [expr {[llength $p] - 1}] 295 set iFirst [expr $iLast - $O(-near) - [llength $p]] 296 297 for {set i $iFirst} {$i <= $iLast} {incr i} { 298 set lCand [lrange $zCol $i [expr $i+$nPm1]] 299 set bMatch 1 300 foreach tok $p term $lCand { 301 if {[nearset_match aDict $tok $term]==0} { set bMatch 0 ; break } 302 } 303 if {$bMatch} { lappend B($iPhrase) $i } 304 } 305 306 if {![info exists B($iPhrase)]} break 307 } 308 309 if {$iPhrase==$nPhrase} { 310 for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} { 311 set A($iCol,$iPhrase) [concat $A($iCol,$iPhrase) $B($iPhrase)] 312 set A($iCol,$iPhrase) [lsort -integer -uniq $A($iCol,$iPhrase)] 313 } 314 } 315 } 316 } 317 318 set res [list] 319 #puts [array names A] 320 321 for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} { 322 for {set iCol 0} {$iCol < [llength $aCol]} {incr iCol} { 323 foreach a $A($iCol,$iPhrase) { 324 lappend res "$counter.$iCol.$a" 325 } 326 } 327 incr counter 328 } 329 330 #puts "$aCol -> $res" 331 sort_poslist $res 332} 333 334proc nearset_match {aDictVar tok term} { 335 if {[string match $tok $term]} { return 1 } 336 337 upvar $aDictVar aDict 338 if {[info exists aDict($tok)]} { 339 foreach s $aDict($tok) { 340 if {[string match $s $term]} { return 1 } 341 } 342 } 343 return 0; 344} 345 346#------------------------------------------------------------------------- 347# Usage: 348# 349# sort_poslist LIST 350# 351# Sort a position list of the type returned by command [nearset] 352# 353proc sort_poslist {L} { 354 lsort -command instcompare $L 355} 356proc instcompare {lhs rhs} { 357 foreach {p1 c1 o1} [split $lhs .] {} 358 foreach {p2 c2 o2} [split $rhs .] {} 359 360 set res [expr $c1 - $c2] 361 if {$res==0} { set res [expr $o1 - $o2] } 362 if {$res==0} { set res [expr $p1 - $p2] } 363 364 return $res 365} 366 367#------------------------------------------------------------------------- 368# Logical operators used by the commands returned by fts5_tcl_expr(). 369# 370proc AND {args} { 371 foreach a $args { 372 if {[llength $a]==0} { return [list] } 373 } 374 sort_poslist [concat {*}$args] 375} 376proc OR {args} { 377 sort_poslist [concat {*}$args] 378} 379proc NOT {a b} { 380 if {[llength $b]>0} { return [list] } 381 return $a 382} 383 384#------------------------------------------------------------------------- 385# This command is similar to [split], except that it also provides the 386# start and end offsets of each token. For example: 387# 388# [fts5_tokenize_split "abc d ef"] -> {abc 0 3 d 4 5 ef 6 8} 389# 390 391proc gobble_whitespace {textvar} { 392 upvar $textvar t 393 regexp {([ ]*)(.*)} $t -> space t 394 return [string length $space] 395} 396 397proc gobble_text {textvar wordvar} { 398 upvar $textvar t 399 upvar $wordvar w 400 regexp {([^ ]*)(.*)} $t -> w t 401 return [string length $w] 402} 403 404proc fts5_tokenize_split {text} { 405 set token "" 406 set ret [list] 407 set iOff [gobble_whitespace text] 408 while {[set nToken [gobble_text text word]]} { 409 lappend ret $word $iOff [expr $iOff+$nToken] 410 incr iOff $nToken 411 incr iOff [gobble_whitespace text] 412 } 413 414 set ret 415} 416 417#------------------------------------------------------------------------- 418# 419proc foreach_detail_mode {prefix script} { 420 set saved $::testprefix 421 foreach d [list full col none] { 422 set s [string map [list %DETAIL% $d] $script] 423 set ::detail $d 424 set ::testprefix "$prefix-$d" 425 reset_db 426 uplevel $s 427 unset ::detail 428 } 429 set ::testprefix $saved 430} 431 432proc detail_check {} { 433 if {$::detail != "none" && $::detail!="full" && $::detail!="col"} { 434 error "not in foreach_detail_mode {...} block" 435 } 436} 437proc detail_is_none {} { detail_check ; expr {$::detail == "none"} } 438proc detail_is_col {} { detail_check ; expr {$::detail == "col" } } 439proc detail_is_full {} { detail_check ; expr {$::detail == "full"} } 440 441 442#------------------------------------------------------------------------- 443# Convert a poslist of the type returned by fts5_test_poslist() to a 444# collist as returned by fts5_test_collist(). 445# 446proc fts5_poslist2collist {poslist} { 447 set res [list] 448 foreach h $poslist { 449 regexp {(.*)\.[1234567890]+} $h -> cand 450 lappend res $cand 451 } 452 set res [lsort -command fts5_collist_elem_compare -unique $res] 453 return $res 454} 455 456# Comparison function used by fts5_poslist2collist to sort collist entries. 457proc fts5_collist_elem_compare {a b} { 458 foreach {a1 a2} [split $a .] {} 459 foreach {b1 b2} [split $b .] {} 460 461 if {$a1==$b1} { return [expr $a2 - $b2] } 462 return [expr $a1 - $b1] 463} 464 465 466#-------------------------------------------------------------------------- 467# Construct and return a tcl list equivalent to that returned by the SQL 468# query executed against database handle [db]: 469# 470# SELECT 471# rowid, 472# fts5_test_poslist($tbl), 473# fts5_test_collist($tbl) 474# FROM $tbl('$expr') 475# ORDER BY rowid $order; 476# 477proc fts5_query_data {expr tbl {order ASC} {aDictVar ""}} { 478 479 # Figure out the set of columns in the FTS5 table. This routine does 480 # not handle tables with UNINDEXED columns, but if it did, it would 481 # have to be here. 482 db eval "PRAGMA table_info = $tbl" x { lappend lCols $x(name) } 483 484 set d "" 485 if {$aDictVar != ""} { 486 upvar $aDictVar aDict 487 set d aDict 488 } 489 490 set cols "" 491 foreach e $lCols { append cols ", '$e'" } 492 set tclexpr [db one [subst -novar { 493 SELECT fts5_expr_tcl( $expr, 'nearset $cols -dict $d -pc ::pc' [set cols] ) 494 }]] 495 496 set res [list] 497 db eval "SELECT rowid, * FROM $tbl ORDER BY rowid $order" x { 498 set cols [list] 499 foreach col $lCols { lappend cols $x($col) } 500 501 set ::pc 0 502 set rowdata [eval $tclexpr] 503 if {$rowdata != ""} { 504 lappend res $x(rowid) $rowdata [fts5_poslist2collist $rowdata] 505 } 506 } 507 508 set res 509} 510 511#------------------------------------------------------------------------- 512# Similar to [fts5_query_data], but omit the collist field. 513# 514proc fts5_poslist_data {expr tbl {order ASC} {aDictVar ""}} { 515 set res [list] 516 517 if {$aDictVar!=""} { 518 upvar $aDictVar aDict 519 set dict aDict 520 } else { 521 set dict "" 522 } 523 524 foreach {rowid poslist collist} [fts5_query_data $expr $tbl $order $dict] { 525 lappend res $rowid $poslist 526 } 527 set res 528} 529 530proc fts5_collist_data {expr tbl {order ASC} {aDictVar ""}} { 531 set res [list] 532 533 if {$aDictVar!=""} { 534 upvar $aDictVar aDict 535 set dict aDict 536 } else { 537 set dict "" 538 } 539 540 foreach {rowid poslist collist} [fts5_query_data $expr $tbl $order $dict] { 541 lappend res $rowid $collist 542 } 543 set res 544} 545 546#------------------------------------------------------------------------- 547# 548 549# This command will only work inside a [foreach_detail_mode] block. It tests 550# whether or not expression $expr run on FTS5 table $tbl is supported by 551# the current mode. If so, 1 is returned. If not, 0. 552# 553# detail=full (all queries supported) 554# detail=col (all but phrase queries and NEAR queries) 555# detail=none (all but phrase queries, NEAR queries, and column filters) 556# 557proc fts5_expr_ok {expr tbl} { 558 559 if {![detail_is_full]} { 560 set nearset "nearset_rc" 561 if {[detail_is_col]} { set nearset "nearset_rf" } 562 563 set ::expr_not_ok 0 564 db eval "PRAGMA table_info = $tbl" x { lappend lCols $x(name) } 565 566 set cols "" 567 foreach e $lCols { append cols ", '$e'" } 568 set ::pc 0 569 set tclexpr [db one [subst -novar { 570 SELECT fts5_expr_tcl( $expr, '[set nearset] $cols -pc ::pc' [set cols] ) 571 }]] 572 eval $tclexpr 573 if {$::expr_not_ok} { return 0 } 574 } 575 576 return 1 577} 578 579# Helper for [fts5_expr_ok] 580proc nearset_rf {aCol args} { 581 set idx [lsearch -exact $args --] 582 if {$idx != [llength $args]-2 || [llength [lindex $args end]]!=1} { 583 set ::expr_not_ok 1 584 } 585 list 586} 587 588# Helper for [fts5_expr_ok] 589proc nearset_rc {aCol args} { 590 nearset_rf $aCol {*}$args 591 if {[lsearch $args -col]>=0} { 592 set ::expr_not_ok 1 593 } 594 list 595} 596 597 598#------------------------------------------------------------------------- 599# Code for a simple Tcl tokenizer that supports synonyms at query time. 600# 601proc tclnum_tokenize {mode tflags text} { 602 foreach {w iStart iEnd} [fts5_tokenize_split $text] { 603 sqlite3_fts5_token $w $iStart $iEnd 604 if {$tflags == $mode && [info exists ::tclnum_syn($w)]} { 605 foreach s $::tclnum_syn($w) { sqlite3_fts5_token -colo $s $iStart $iEnd } 606 } 607 } 608} 609 610proc tclnum_create {args} { 611 set mode query 612 if {[llength $args]} { 613 set mode [lindex $args 0] 614 } 615 if {$mode != "query" && $mode != "document"} { error "bad mode: $mode" } 616 return [list tclnum_tokenize $mode] 617} 618 619proc fts5_tclnum_register {db} { 620 foreach SYNDICT { 621 {zero 0} 622 {one 1 i} 623 {two 2 ii} 624 {three 3 iii} 625 {four 4 iv} 626 {five 5 v} 627 {six 6 vi} 628 {seven 7 vii} 629 {eight 8 viii} 630 {nine 9 ix} 631 632 {a1 a2 a3 a4 a5 a6 a7 a8 a9} 633 {b1 b2 b3 b4 b5 b6 b7 b8 b9} 634 {c1 c2 c3 c4 c5 c6 c7 c8 c9} 635 } { 636 foreach s $SYNDICT { 637 set o [list] 638 foreach x $SYNDICT {if {$x!=$s} {lappend o $x}} 639 set ::tclnum_syn($s) $o 640 } 641 } 642 sqlite3_fts5_create_tokenizer db tclnum tclnum_create 643} 644# 645# End of tokenizer code. 646#------------------------------------------------------------------------- 647 648