1from __future__ import print_function 2import numpy as np 3import tables 4from time import time 5 6N = 1000 * 1000 7NCOLL = 200 # 200 collections maximum 8 9# In order to have reproducible results 10np.random.seed(19) 11 12 13class Energies(tables.IsDescription): 14 collection = tables.UInt8Col() 15 energy = tables.Float64Col() 16 17 18def fill_bucket(lbucket): 19 #c = np.random.normal(NCOLL/2, NCOLL/10, lbucket) 20 c = np.random.normal(NCOLL / 2, NCOLL / 100, lbucket) 21 e = np.arange(lbucket, dtype='f8') 22 return c, e 23 24# Fill the table 25t1 = time() 26f = tables.open_file("data.nobackup/collations.h5", "w") 27table = f.create_table("/", "Energies", Energies, expectedrows=N) 28# Fill the table with values 29lbucket = 1000 # Fill in buckets of 1000 rows, for speed 30for i in range(0, N, lbucket): 31 bucket = fill_bucket(lbucket) 32 table.append(bucket) 33# Fill the remaining rows 34bucket = fill_bucket(N % lbucket) 35table.append(bucket) 36f.close() 37print("Time to create the table with %d entries: %.3f" % (N, time() - t1)) 38 39# Now, read the table and group it by collection 40f = tables.open_file("data.nobackup/collations.h5", "a") 41table = f.root.Energies 42 43######################################################### 44# First solution: load the table completely in memory 45######################################################### 46t1 = time() 47t = table[:] # convert to structured array 48coll1 = [] 49collections = np.unique(t['collection']) 50for c in collections: 51 cond = t['collection'] == c 52 energy_this_collection = t['energy'][cond] 53 sener = energy_this_collection.sum() 54 coll1.append(sener) 55 print(c, ' : ', sener) 56del collections, energy_this_collection 57print("Time for first solution: %.3f" % (time() - t1)) 58 59######################################################### 60# Second solution: load all the collections in memory 61######################################################### 62t1 = time() 63collections = {} 64for row in table: 65 c = row['collection'] 66 e = row['energy'] 67 if c in collections: 68 collections[c].append(e) 69 else: 70 collections[c] = [e] 71# Convert the lists in numpy arrays 72coll2 = [] 73for c in sorted(collections): 74 energy_this_collection = np.array(collections[c]) 75 sener = energy_this_collection.sum() 76 coll2.append(sener) 77 print(c, ' : ', sener) 78del collections, energy_this_collection 79print("Time for second solution: %.3f" % (time() - t1)) 80 81t1 = time() 82table.cols.collection.create_csindex() 83# table.cols.collection.reindex() 84print("Time for indexing: %.3f" % (time() - t1)) 85 86######################################################### 87# Third solution: load each collection separately 88######################################################### 89t1 = time() 90coll3 = [] 91for c in np.unique(table.col('collection')): 92 energy_this_collection = table.read_where( 93 'collection == c', field='energy') 94 sener = energy_this_collection.sum() 95 coll3.append(sener) 96 print(c, ' : ', sener) 97del energy_this_collection 98print("Time for third solution: %.3f" % (time() - t1)) 99 100 101t1 = time() 102table2 = table.copy('/', 'EnergySortedByCollation', overwrite=True, 103 sortby="collection", propindexes=True) 104print("Time for sorting: %.3f" % (time() - t1)) 105 106##################################################################### 107# Fourth solution: load each collection separately. Sorted table. 108##################################################################### 109t1 = time() 110coll4 = [] 111for c in np.unique(table2.col('collection')): 112 energy_this_collection = table2.read_where( 113 'collection == c', field='energy') 114 sener = energy_this_collection.sum() 115 coll4.append(sener) 116 print(c, ' : ', sener) 117 del energy_this_collection 118print("Time for fourth solution: %.3f" % (time() - t1)) 119 120 121# Finally, check that all solutions do match 122assert coll1 == coll2 == coll3 == coll4 123 124f.close() 125