1# 2014 Dec 19
2#
3# The author disclaims copyright to this source code.  In place of
4# a legal notice, here is a blessing:
5#
6#    May you do good and not evil.
7#    May you find forgiveness for yourself and forgive others.
8#    May you share freely, never taking more than you give.
9#
10#***********************************************************************
11#
12
13if {![info exists testdir]} {
14  set testdir [file join [file dirname [info script]] .. .. .. test]
15}
16source $testdir/tester.tcl
17
18ifcapable !fts5 {
19  proc return_if_no_fts5 {} {
20    finish_test
21    return -code return
22  }
23  return
24} else {
25  proc return_if_no_fts5 {} {}
26}
27
28catch {
29  sqlite3_fts5_may_be_corrupt 0
30  reset_db
31}
32
33proc fts5_test_poslist {cmd} {
34  set res [list]
35  for {set i 0} {$i < [$cmd xInstCount]} {incr i} {
36    lappend res [string map {{ } .} [$cmd xInst $i]]
37  }
38  set res
39}
40
41proc fts5_test_poslist2 {cmd} {
42  set res [list]
43
44  for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} {
45    $cmd xPhraseForeach $i c o {
46      lappend res $i.$c.$o
47    }
48  }
49
50  #set res
51  sort_poslist $res
52}
53
54proc fts5_test_collist {cmd} {
55  set res [list]
56
57  for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} {
58    $cmd xPhraseColumnForeach $i c { lappend res $i.$c }
59  }
60
61  set res
62}
63
64proc fts5_test_columnsize {cmd} {
65  set res [list]
66  for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
67    lappend res [$cmd xColumnSize $i]
68  }
69  set res
70}
71
72proc fts5_test_columntext {cmd} {
73  set res [list]
74  for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
75    lappend res [$cmd xColumnText $i]
76  }
77  set res
78}
79
80proc fts5_test_columntotalsize {cmd} {
81  set res [list]
82  for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
83    lappend res [$cmd xColumnTotalSize $i]
84  }
85  set res
86}
87
88proc test_append_token {varname token iStart iEnd} {
89  upvar $varname var
90  lappend var $token
91  return "SQLITE_OK"
92}
93proc fts5_test_tokenize {cmd} {
94  set res [list]
95  for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
96    set tokens [list]
97    $cmd xTokenize [$cmd xColumnText $i] [list test_append_token tokens]
98    lappend res $tokens
99  }
100  set res
101}
102
103proc fts5_test_rowcount {cmd} {
104  $cmd xRowCount
105}
106
107proc test_queryphrase_cb {cnt cmd} {
108  upvar $cnt L
109  for {set i 0} {$i < [$cmd xInstCount]} {incr i} {
110    foreach {ip ic io} [$cmd xInst $i] break
111    set A($ic) 1
112  }
113  foreach ic [array names A] {
114    lset L $ic [expr {[lindex $L $ic] + 1}]
115  }
116}
117proc fts5_test_queryphrase {cmd} {
118  set res [list]
119  for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} {
120    set cnt [list]
121    for {set j 0} {$j < [$cmd xColumnCount]} {incr j} { lappend cnt 0 }
122    $cmd xQueryPhrase $i [list test_queryphrase_cb cnt]
123    lappend res $cnt
124  }
125  set res
126}
127
128proc fts5_test_phrasecount {cmd} {
129  $cmd xPhraseCount
130}
131
132proc fts5_test_all {cmd} {
133  set res [list]
134  lappend res columnsize      [fts5_test_columnsize $cmd]
135  lappend res columntext      [fts5_test_columntext $cmd]
136  lappend res columntotalsize [fts5_test_columntotalsize $cmd]
137  lappend res poslist         [fts5_test_poslist $cmd]
138  lappend res tokenize        [fts5_test_tokenize $cmd]
139  lappend res rowcount        [fts5_test_rowcount $cmd]
140  set res
141}
142
143proc fts5_aux_test_functions {db} {
144  foreach f {
145    fts5_test_columnsize
146    fts5_test_columntext
147    fts5_test_columntotalsize
148    fts5_test_poslist
149    fts5_test_poslist2
150    fts5_test_collist
151    fts5_test_tokenize
152    fts5_test_rowcount
153    fts5_test_all
154
155    fts5_test_queryphrase
156    fts5_test_phrasecount
157  } {
158    sqlite3_fts5_create_function $db $f $f
159  }
160}
161
162proc fts5_segcount {tbl} {
163  set N 0
164  foreach n [fts5_level_segs $tbl] { incr N $n }
165  set N
166}
167
168proc fts5_level_segs {tbl} {
169  set sql "SELECT fts5_decode(rowid,block) aS r FROM ${tbl}_data WHERE rowid=10"
170  set ret [list]
171  foreach L [lrange [db one $sql] 1 end] {
172    lappend ret [expr [llength $L] - 3]
173  }
174  set ret
175}
176
177proc fts5_level_segids {tbl} {
178  set sql "SELECT fts5_decode(rowid,block) aS r FROM ${tbl}_data WHERE rowid=10"
179  set ret [list]
180  foreach L [lrange [db one $sql] 1 end] {
181    set lvl [list]
182    foreach S [lrange $L 3 end] {
183      regexp {id=([1234567890]*)} $S -> segid
184      lappend lvl $segid
185    }
186    lappend ret $lvl
187  }
188  set ret
189}
190
191proc fts5_rnddoc {n} {
192  set map [list 0 a  1 b  2 c  3 d  4 e  5 f  6 g  7 h  8 i  9 j]
193  set doc [list]
194  for {set i 0} {$i < $n} {incr i} {
195    lappend doc "x[string map $map [format %.3d [expr int(rand()*1000)]]]"
196  }
197  set doc
198}
199
200#-------------------------------------------------------------------------
201# Usage:
202#
203#   nearset aCol ?-pc VARNAME? ?-near N? ?-col C? -- phrase1 phrase2...
204#
205# This command is used to test if a document (set of column values) matches
206# the logical equivalent of a single FTS5 NEAR() clump and, if so, return
207# the equivalent of an FTS5 position list.
208#
209# Parameter $aCol is passed a list of the column values for the document
210# to test. Parameters $phrase1 and so on are the phrases.
211#
212# The result is a list of phrase hits. Each phrase hit is formatted as
213# three integers separated by "." characters, in the following format:
214#
215#   <phrase number> . <column number> . <token offset>
216#
217# Options:
218#
219#   -near N        (NEAR distance. Default 10)
220#   -col  C        (List of column indexes to match against)
221#   -pc   VARNAME  (variable in caller frame to use for phrase numbering)
222#   -dict VARNAME  (array in caller frame to use for synonyms)
223#
224proc nearset {aCol args} {
225
226  # Process the command line options.
227  #
228  set O(-near) 10
229  set O(-col)  {}
230  set O(-pc)   ""
231  set O(-dict) ""
232
233  set nOpt [lsearch -exact $args --]
234  if {$nOpt<0} { error "no -- option" }
235
236  # Set $lPhrase to be a list of phrases. $nPhrase its length.
237  set lPhrase [lrange $args [expr $nOpt+1] end]
238  set nPhrase [llength $lPhrase]
239
240  foreach {k v} [lrange $args 0 [expr $nOpt-1]] {
241    if {[info exists O($k)]==0} { error "unrecognized option $k" }
242    set O($k) $v
243  }
244
245  if {$O(-pc) == ""} {
246    set counter 0
247  } else {
248    upvar $O(-pc) counter
249  }
250
251  if {$O(-dict)!=""} { upvar $O(-dict) aDict }
252
253  for {set j 0} {$j < [llength $aCol]} {incr j} {
254    for {set i 0} {$i < $nPhrase} {incr i} {
255      set A($j,$i) [list]
256    }
257  }
258
259  # Loop through each column of the current row.
260  for {set iCol 0} {$iCol < [llength $aCol]} {incr iCol} {
261
262    # If there is a column filter, test whether this column is excluded. If
263    # so, skip to the next iteration of this loop. Otherwise, set zCol to the
264    # column value and nToken to the number of tokens that comprise it.
265    if {$O(-col)!="" && [lsearch $O(-col) $iCol]<0} continue
266    set zCol [lindex $aCol $iCol]
267    set nToken [llength $zCol]
268
269    # Each iteration of the following loop searches a substring of the
270    # column value for phrase matches. The last token of the substring
271    # is token $iLast of the column value. The first token is:
272    #
273    #   iFirst = ($iLast - $O(-near) - 1)
274    #
275    # where $sz is the length of the phrase being searched for. A phrase
276    # counts as matching the substring if its first token lies on or before
277    # $iLast and its last token on or after $iFirst.
278    #
279    # For example, if the query is "NEAR(a+b c, 2)" and the column value:
280    #
281    #   "x x x x A B x x C x"
282    #    0 1 2 3 4 5 6 7 8 9"
283    #
284    # when (iLast==8 && iFirst=5) the range will contain both phrases and
285    # so both instances can be added to the output poslists.
286    #
287    set iLast [expr $O(-near) >= $nToken ? $nToken - 1 : $O(-near)]
288    for { } {$iLast < $nToken} {incr iLast} {
289
290      catch { array unset B }
291
292      for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} {
293        set p [lindex $lPhrase $iPhrase]
294        set nPm1 [expr {[llength $p] - 1}]
295        set iFirst [expr $iLast - $O(-near) - [llength $p]]
296
297        for {set i $iFirst} {$i <= $iLast} {incr i} {
298          set lCand [lrange $zCol $i [expr $i+$nPm1]]
299          set bMatch 1
300          foreach tok $p term $lCand {
301            if {[nearset_match aDict $tok $term]==0} { set bMatch 0 ; break }
302          }
303          if {$bMatch} { lappend B($iPhrase) $i }
304        }
305
306        if {![info exists B($iPhrase)]} break
307      }
308
309      if {$iPhrase==$nPhrase} {
310        for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} {
311          set A($iCol,$iPhrase) [concat $A($iCol,$iPhrase) $B($iPhrase)]
312          set A($iCol,$iPhrase) [lsort -integer -uniq $A($iCol,$iPhrase)]
313        }
314      }
315    }
316  }
317
318  set res [list]
319  #puts [array names A]
320
321  for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} {
322    for {set iCol 0} {$iCol < [llength $aCol]} {incr iCol} {
323      foreach a $A($iCol,$iPhrase) {
324        lappend res "$counter.$iCol.$a"
325      }
326    }
327    incr counter
328  }
329
330  #puts "$aCol -> $res"
331  sort_poslist $res
332}
333
334proc nearset_match {aDictVar tok term} {
335  if {[string match $tok $term]} { return 1 }
336
337  upvar $aDictVar aDict
338  if {[info exists aDict($tok)]} {
339    foreach s $aDict($tok) {
340      if {[string match $s $term]} { return 1 }
341    }
342  }
343  return 0;
344}
345
346#-------------------------------------------------------------------------
347# Usage:
348#
349#   sort_poslist LIST
350#
351# Sort a position list of the type returned by command [nearset]
352#
353proc sort_poslist {L} {
354  lsort -command instcompare $L
355}
356proc instcompare {lhs rhs} {
357  foreach {p1 c1 o1} [split $lhs .] {}
358  foreach {p2 c2 o2} [split $rhs .] {}
359
360  set res [expr $c1 - $c2]
361  if {$res==0} { set res [expr $o1 - $o2] }
362  if {$res==0} { set res [expr $p1 - $p2] }
363
364  return $res
365}
366
367#-------------------------------------------------------------------------
368# Logical operators used by the commands returned by fts5_tcl_expr().
369#
370proc AND {args} {
371  foreach a $args {
372    if {[llength $a]==0} { return [list] }
373  }
374  sort_poslist [concat {*}$args]
375}
376proc OR {args} {
377  sort_poslist [concat {*}$args]
378}
379proc NOT {a b} {
380  if {[llength $b]>0} { return [list] }
381  return $a
382}
383
384#-------------------------------------------------------------------------
385# This command is similar to [split], except that it also provides the
386# start and end offsets of each token. For example:
387#
388#   [fts5_tokenize_split "abc d ef"] -> {abc 0 3 d 4 5 ef 6 8}
389#
390
391proc gobble_whitespace {textvar} {
392  upvar $textvar t
393  regexp {([ ]*)(.*)} $t -> space t
394  return [string length $space]
395}
396
397proc gobble_text {textvar wordvar} {
398  upvar $textvar t
399  upvar $wordvar w
400  regexp {([^ ]*)(.*)} $t -> w t
401  return [string length $w]
402}
403
404proc fts5_tokenize_split {text} {
405  set token ""
406  set ret [list]
407  set iOff [gobble_whitespace text]
408  while {[set nToken [gobble_text text word]]} {
409    lappend ret $word $iOff [expr $iOff+$nToken]
410    incr iOff $nToken
411    incr iOff [gobble_whitespace text]
412  }
413
414  set ret
415}
416
417#-------------------------------------------------------------------------
418#
419proc foreach_detail_mode {prefix script} {
420  set saved $::testprefix
421  foreach d [list full col none] {
422    set s [string map [list %DETAIL% $d] $script]
423    set ::detail $d
424    set ::testprefix "$prefix-$d"
425    reset_db
426    uplevel $s
427    unset ::detail
428  }
429  set ::testprefix $saved
430}
431
432proc detail_check {} {
433  if {$::detail != "none" && $::detail!="full" && $::detail!="col"} {
434    error "not in foreach_detail_mode {...} block"
435  }
436}
437proc detail_is_none {} { detail_check ; expr {$::detail == "none"} }
438proc detail_is_col {}  { detail_check ; expr {$::detail == "col" } }
439proc detail_is_full {} { detail_check ; expr {$::detail == "full"} }
440
441
442#-------------------------------------------------------------------------
443# Convert a poslist of the type returned by fts5_test_poslist() to a
444# collist as returned by fts5_test_collist().
445#
446proc fts5_poslist2collist {poslist} {
447  set res [list]
448  foreach h $poslist {
449    regexp {(.*)\.[1234567890]+} $h -> cand
450    lappend res $cand
451  }
452  set res [lsort -command fts5_collist_elem_compare -unique $res]
453  return $res
454}
455
456# Comparison function used by fts5_poslist2collist to sort collist entries.
457proc fts5_collist_elem_compare {a b} {
458  foreach {a1 a2} [split $a .] {}
459  foreach {b1 b2} [split $b .] {}
460
461  if {$a1==$b1} { return [expr $a2 - $b2] }
462  return [expr $a1 - $b1]
463}
464
465
466#--------------------------------------------------------------------------
467# Construct and return a tcl list equivalent to that returned by the SQL
468# query executed against database handle [db]:
469#
470#   SELECT
471#     rowid,
472#     fts5_test_poslist($tbl),
473#     fts5_test_collist($tbl)
474#   FROM $tbl('$expr')
475#   ORDER BY rowid $order;
476#
477proc fts5_query_data {expr tbl {order ASC} {aDictVar ""}} {
478
479  # Figure out the set of columns in the FTS5 table. This routine does
480  # not handle tables with UNINDEXED columns, but if it did, it would
481  # have to be here.
482  db eval "PRAGMA table_info = $tbl" x { lappend lCols $x(name) }
483
484  set d ""
485  if {$aDictVar != ""} {
486    upvar $aDictVar aDict
487    set d aDict
488  }
489
490  set cols ""
491  foreach e $lCols { append cols ", '$e'" }
492  set tclexpr [db one [subst -novar {
493    SELECT fts5_expr_tcl( $expr, 'nearset $cols -dict $d -pc ::pc' [set cols] )
494  }]]
495
496  set res [list]
497  db eval "SELECT rowid, * FROM $tbl ORDER BY rowid $order" x {
498    set cols [list]
499    foreach col $lCols { lappend cols $x($col) }
500
501    set ::pc 0
502    set rowdata [eval $tclexpr]
503    if {$rowdata != ""} {
504      lappend res $x(rowid) $rowdata [fts5_poslist2collist $rowdata]
505    }
506  }
507
508  set res
509}
510
511#-------------------------------------------------------------------------
512# Similar to [fts5_query_data], but omit the collist field.
513#
514proc fts5_poslist_data {expr tbl {order ASC} {aDictVar ""}} {
515  set res [list]
516
517  if {$aDictVar!=""} {
518    upvar $aDictVar aDict
519    set dict aDict
520  } else {
521    set dict ""
522  }
523
524  foreach {rowid poslist collist} [fts5_query_data $expr $tbl $order $dict] {
525    lappend res $rowid $poslist
526  }
527  set res
528}
529
530proc fts5_collist_data {expr tbl {order ASC} {aDictVar ""}} {
531  set res [list]
532
533  if {$aDictVar!=""} {
534    upvar $aDictVar aDict
535    set dict aDict
536  } else {
537    set dict ""
538  }
539
540  foreach {rowid poslist collist} [fts5_query_data $expr $tbl $order $dict] {
541    lappend res $rowid $collist
542  }
543  set res
544}
545
546#-------------------------------------------------------------------------
547#
548
549# This command will only work inside a [foreach_detail_mode] block. It tests
550# whether or not expression $expr run on FTS5 table $tbl is supported by
551# the current mode. If so, 1 is returned. If not, 0.
552#
553#   detail=full    (all queries supported)
554#   detail=col     (all but phrase queries and NEAR queries)
555#   detail=none    (all but phrase queries, NEAR queries, and column filters)
556#
557proc fts5_expr_ok {expr tbl} {
558
559  if {![detail_is_full]} {
560    set nearset "nearset_rc"
561    if {[detail_is_col]} { set nearset "nearset_rf" }
562
563    set ::expr_not_ok 0
564    db eval "PRAGMA table_info = $tbl" x { lappend lCols $x(name) }
565
566    set cols ""
567    foreach e $lCols { append cols ", '$e'" }
568    set ::pc 0
569    set tclexpr [db one [subst -novar {
570      SELECT fts5_expr_tcl( $expr, '[set nearset] $cols -pc ::pc' [set cols] )
571    }]]
572    eval $tclexpr
573    if {$::expr_not_ok} { return 0 }
574  }
575
576  return 1
577}
578
579# Helper for [fts5_expr_ok]
580proc nearset_rf {aCol args} {
581  set idx [lsearch -exact $args --]
582  if {$idx != [llength $args]-2 || [llength [lindex $args end]]!=1} {
583    set ::expr_not_ok 1
584  }
585  list
586}
587
588# Helper for [fts5_expr_ok]
589proc nearset_rc {aCol args} {
590  nearset_rf $aCol {*}$args
591  if {[lsearch $args -col]>=0} {
592    set ::expr_not_ok 1
593  }
594  list
595}
596
597
598#-------------------------------------------------------------------------
599# Code for a simple Tcl tokenizer that supports synonyms at query time.
600#
601proc tclnum_tokenize {mode tflags text} {
602  foreach {w iStart iEnd} [fts5_tokenize_split $text] {
603    sqlite3_fts5_token $w $iStart $iEnd
604    if {$tflags == $mode && [info exists ::tclnum_syn($w)]} {
605      foreach s $::tclnum_syn($w)  { sqlite3_fts5_token -colo $s $iStart $iEnd }
606    }
607  }
608}
609
610proc tclnum_create {args} {
611  set mode query
612  if {[llength $args]} {
613    set mode [lindex $args 0]
614  }
615  if {$mode != "query" && $mode != "document"} { error "bad mode: $mode" }
616  return [list tclnum_tokenize $mode]
617}
618
619proc fts5_tclnum_register {db} {
620  foreach SYNDICT {
621    {zero  0}
622    {one   1 i}
623    {two   2 ii}
624    {three 3 iii}
625    {four  4 iv}
626    {five  5 v}
627    {six   6 vi}
628    {seven 7 vii}
629    {eight 8 viii}
630    {nine  9 ix}
631
632    {a1 a2 a3 a4 a5 a6 a7 a8 a9}
633    {b1 b2 b3 b4 b5 b6 b7 b8 b9}
634    {c1 c2 c3 c4 c5 c6 c7 c8 c9}
635  } {
636    foreach s $SYNDICT {
637      set o [list]
638      foreach x $SYNDICT {if {$x!=$s} {lappend o $x}}
639      set ::tclnum_syn($s) $o
640    }
641  }
642  sqlite3_fts5_create_tokenizer db tclnum tclnum_create
643}
644#
645# End of tokenizer code.
646#-------------------------------------------------------------------------
647
648