1from __future__ import print_function
2import numpy as np
3import tables
4from time import time
5
# Total number of rows written to the benchmark table
N = 10 ** 6
# Upper bound for collection identifiers (200 collections maximum)
NCOLL = 200

# Seed NumPy's global generator so that every run produces the same data
np.random.seed(19)
11
12
class Energies(tables.IsDescription):
    """Row layout of the benchmark table: a collection id plus an energy.

    The script appends data as a positional ``(collection, energy)`` pair of
    arrays, so the column order matters.  The positions are therefore pinned
    explicitly with ``pos`` instead of relying on the default ordering.
    """
    # Collection identifier, stored as an unsigned 8-bit integer
    collection = tables.UInt8Col(pos=0)
    # Energy value, stored as a 64-bit float
    energy = tables.Float64Col(pos=1)
16
17
def fill_bucket(lbucket, ncoll=None, sigma_divisor=100):
    """Generate one bucket of synthetic rows for the ``Energies`` table.

    Parameters
    ----------
    lbucket : int
        Number of rows to generate.  ``0`` yields two empty arrays.
    ncoll : number, optional
        Sets the distribution of the collection ids: they are drawn from a
        normal distribution centred on ``ncoll / 2``.  Defaults to the
        module-level ``NCOLL``.
    sigma_divisor : number, optional
        The standard deviation of the distribution is
        ``ncoll / sigma_divisor``.  The default of 100 gives a narrow spread
        (few distinct collections); 10 gives a wider one.

    Returns
    -------
    tuple of numpy.ndarray
        ``(c, e)`` where ``c`` holds ``lbucket`` float samples for the
        collection column and ``e`` is ``0.0 .. lbucket - 1`` as float64.
    """
    if ncoll is None:
        # Resolved at call time so the module constant stays the default
        ncoll = NCOLL
    c = np.random.normal(ncoll / 2, ncoll / sigma_divisor, lbucket)
    e = np.arange(lbucket, dtype='f8')
    return c, e
23
# Fill the table
t1 = time()
lbucket = 1000   # Fill in buckets of 1000 rows, for speed
# The context manager closes the file even if an append fails half-way
with tables.open_file("data.nobackup/collations.h5", "w") as f:
    table = f.create_table("/", "Energies", Energies, expectedrows=N)
    # Append only the *complete* buckets here: iterating over
    # range(0, N, lbucket) would write one bucket too many whenever N is
    # not a multiple of lbucket.
    for _ in range(N // lbucket):
        table.append(fill_bucket(lbucket))
    # Fill the remaining rows, if any
    remainder = N % lbucket
    if remainder:
        table.append(fill_bucket(remainder))
print("Time to create the table with %d entries: %.3f" % (N, time() - t1))
38
# Now, read the table and group it by collection.
#
# Each strategy lives in its own helper so that its large temporaries are
# released on return, before the next strategy is timed.


def _sums_from_full_load(tbl):
    """Sum energies per collection after loading the whole table in memory.

    Prints ``collection : sum`` for every collection and returns the sums
    in ascending collection order.
    """
    rows = tbl[:]  # convert to structured array
    sums = []
    for c in np.unique(rows['collection']):
        sener = rows['energy'][rows['collection'] == c].sum()
        sums.append(sener)
        print(c, ' : ', sener)
    return sums


def _sums_from_row_scan(tbl):
    """Sum energies per collection by iterating over the rows one by one.

    Every energy is first gathered in a per-collection list; the lists are
    then converted to NumPy arrays and summed.  Prints ``collection : sum``
    and returns the sums in ascending collection order.
    """
    groups = {}
    for row in tbl:
        groups.setdefault(row['collection'], []).append(row['energy'])
    # Convert the lists in numpy arrays
    sums = []
    for c in sorted(groups):
        sener = np.array(groups[c]).sum()
        sums.append(sener)
        print(c, ' : ', sener)
    return sums


def _sums_from_queries(tbl):
    """Sum energies per collection with one ``read_where`` query each.

    Prints ``collection : sum`` and returns the sums in ascending
    collection order.
    """
    sums = []
    for c in np.unique(tbl.col('collection')):
        # Pass the value explicitly instead of relying on the implicit
        # lookup of ``c`` in the calling namespace.
        energies = tbl.read_where(
            'collection == c', field='energy', condvars={'c': c})
        sener = energies.sum()
        sums.append(sener)
        print(c, ' : ', sener)
    return sums


# Append mode is needed because an index and a sorted copy are created below.
# The context manager closes the file even if a step or the final check fails.
with tables.open_file("data.nobackup/collations.h5", "a") as f:
    table = f.root.Energies

    #########################################################
    # First solution: load the table completely in memory
    #########################################################
    t1 = time()
    coll1 = _sums_from_full_load(table)
    print("Time for first solution: %.3f" % (time() - t1))

    #########################################################
    # Second solution: load all the collections in memory
    #########################################################
    t1 = time()
    coll2 = _sums_from_row_scan(table)
    print("Time for second solution: %.3f" % (time() - t1))

    # Index the collection column before running the query-based solutions
    t1 = time()
    table.cols.collection.create_csindex()
    print("Time for indexing: %.3f" % (time() - t1))

    #########################################################
    # Third solution: load each collection separately
    #########################################################
    t1 = time()
    coll3 = _sums_from_queries(table)
    print("Time for third solution: %.3f" % (time() - t1))

    # Create a copy of the table sorted by collection
    t1 = time()
    table2 = table.copy('/', 'EnergySortedByCollation', overwrite=True,
                        sortby="collection", propindexes=True)
    print("Time for sorting: %.3f" % (time() - t1))

    #####################################################################
    # Fourth solution: load each collection separately.  Sorted table.
    #####################################################################
    t1 = time()
    coll4 = _sums_from_queries(table2)
    print("Time for fourth solution: %.3f" % (time() - t1))

    # Finally, check that all solutions do match.  Raised explicitly so the
    # check is not stripped when Python runs with -O.
    if not (coll1 == coll2 == coll3 == coll4):
        raise AssertionError("the four solutions produced different sums")
125