1"""Implementation of fast search algorithms 2Used in select when one of the search fields has a fixed-length size 3 4Instead of taking all the blocks in field files one after the other, a big 5number of blocks are read and the search is made on the buffer 6""" 7 8from datetime import date, datetime 9import itertools 10 11# compatibility with Python 2.3 12try: 13 set([1]) 14except NameError: 15 from sets import Set as set 16 17def rev(s): 18 """ function used to compare strings in decreasing order""" 19 return ''.join([chr(255-ord(c)) for c in s]) 20 21def make_search_func(db,field,value): 22 """Return the search function on a field 23 If value is a pair of values (v1,v2), all blocks between v1 and v2 24 will be returned ; if value is a single value then all blocks with 25 this value will be returned 26 """ 27 bl = db._file[field].block_len # block length 28 if isinstance(value,(list,tuple)): 29 value = list(value) 30 if not len(value)==2: 31 raise ValueError,"If argument is a list, only 2 values \ 32 should be passed (found %s)" %len(value) 33 if not db.fields[field] in [int,float,date,datetime]: 34 raise TypeError,"Search between values is only allowed for " \ 35 "int, float, date and datetime (found %s)" %db.fields[field] 36 db._validate(field,value[0]) 37 db._validate(field,value[1]) 38 value.sort() 39 # convert values in blocks (strings representation in field files) 40 s1,s2 = [ db.f_encode[db.fields[field]](v) for v in value ] 41 # search the common leading characters in s1 and s2 42 common = '' 43 for i in range(len(s1)): 44 if s1[i] == s2[i]: 45 common += s1[i] 46 else: 47 break 48 lc = len(common) 49 Min = s1[lc:] # part of s1 not common with s2 50 Max = s2[lc:] # part of s2 not common with s1 51 52 def _search(buf): 53 """Function searching blocks in the buffer such that 54 s1 <= block <= s2 55 Return a dictionary mapping rank of the block to the block 56 57 The algorithm searches occurences of 'common', then checks 58 that the rest of the block is between Min and Max 59 """ 60 ranks = {} 61 pos = 0 62 while True: 63 # search occurences of the common part between s1 and s2 64 pos = buf.find(common,pos) 65 if pos == -1: 66 break 67 if pos % bl == 0: 68 # pos is a block start 69 block = buf[pos:pos+bl] 70 # rest of the block 71 rest = buf[pos+lc:pos+bl] 72 # compare rest of block to Min and Max 73 if Min <= rest <= Max: 74 ranks[pos/bl] = block 75 pos += 1 76 return ranks 77 78 else: 79 v = db.f_encode[db.fields[field]](value) 80 81 def _search(buf): 82 """Function searching blocks in the buffer such that 83 block == v 84 Return a dictionary mapping rank of the block to the block 85 86 The algorithm searches occurences of the block v in the 87 buffer 88 """ 89 ranks = {} 90 pos = 0 91 while True: 92 pos = buf.find(v,pos) 93 if pos>-1: 94 if pos % bl == 0: 95 # pos is a block start 96 ranks[pos/bl] = buf[pos:pos+bl] 97 pos += 1 98 else: 99 break 100 return ranks 101 102 return _search 103 104def fast_select(db,names,**args): 105 """Handles requests like select(['name'],age=23,name='pierre') when 106 one of the arg keys is fixed length type ; uses a fast search algo 107 instead of browsing all the records 108 109 The search functions are defined for all fixed-length arguments and 110 used to select a subset of record rows in field files 111 """ 112 # fixed and variable length fields 113 f_args = [ (k,v) for k,v in args.iteritems() 114 if hasattr(db._file[k],'block_len') ] 115 v_args = [ (k,v) for (k,v) in args.iteritems() 116 if not hasattr(db._file[k],'block_len') ] 117 arg_names = [ k for k,v in f_args + v_args ] 118 no_args = [ n for n in names if not n in arg_names ] 119 names = arg_names + no_args 120 121 [ db._file[k].seek(0) for k in names + args.keys() ] 122 max_len = max([ db._file[k[0]].block_len for k in f_args ]) 123 num_blocks = db.BLOCKSIZE / max_len 124 funcs = dict([(k,make_search_func(db,k,v)) 125 for (k,v) in f_args]) 126 fl_ranks = [] # absolute ranks in fixed length files 127 bl_offset = 0 # offset of current chunck 128 res = {} 129 while True: 130 buf = {} 131 ranks = {} 132 # read a chunk of num_blocks blocks in each fixed-length file 133 for i,(k,v) in enumerate(f_args): 134 # rank[field] stores the rank of found values in 135 # the buffer, between 0 and num_blocks-1 136 bl = db._file[k].block_len 137 buf[k] = db._file[k].read(num_blocks*bl) 138 ranks[k] = funcs[k](buf[k]) 139 # test end of files 140 if not buf[f_args[0][0]]: 141 break 142 # valid records are those with the same rank in all files 143 rank_set=set(ranks[f_args[0][0]].keys()) 144 if len(f_args)>1: 145 for (k,v) in f_args[1:]: 146 rank_set = rank_set.intersection(set(ranks[k].keys())) 147 for c in rank_set: 148 res[bl_offset+c] = [ ranks[k][c] for k,v in f_args ] 149 bl_offset += num_blocks 150 151 fl_ranks = res.keys() 152 fl_ranks.sort() 153 154 # The field files for the other arguments are browsed ; if their 155 # row is in the subset, test if the value for variable length arguments 156 # is equal to the keyword value 157 vl_files = [ db._file[k] for k,v in v_args ] 158 nbvl = len(vl_files) 159 vl_values = tuple([ db._file[k].to_block(v) for (k,v) in v_args ]) 160 no_args_files = [ db._file[k] for k in no_args ] 161 other_files = vl_files + no_args_files 162 for f in other_files: 163 f.seek(0) 164 165 for i,lines in enumerate(itertools.izip(*other_files)): 166 try: 167 if i == fl_ranks[0]: 168 fl_ranks.pop(0) 169 if lines[:nbvl] == vl_values: 170 res[i]+=list(lines) 171 else: 172 del res[i] 173 except IndexError: 174 break 175 return res,names 176