#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2018  Mate Soos
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; version 2
# of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.

# pylint: disable=invalid-name,line-too-long,too-many-locals,consider-using-sys-exit
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import sklearn.metrics
import sklearn.tree
import re
import ast
import math
import time
import os.path
import sqlite3
import functools
try:
    from termcolor import cprint
except ImportError:
    termcolor_avail = False
else:
    termcolor_avail = True

from pprint import pprint
try:
    import mlflow
except ImportError:
    mlflow_avail = False
else:
    mlflow_avail = True


class QueryHelper:
    def __init__(self, dbfname):
        if not os.path.isfile(dbfname):
            print("ERROR: Database file '%s' does not exist" % dbfname)
            exit(-1)

        self.conn = sqlite3.connect(dbfname)
        self.c = self.conn.cursor()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.conn.commit()
        self.conn.close()
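
# A minimal usage sketch of the context manager above (the database file
# name is hypothetical): the connection is committed and closed on exit.
#
#   with QueryHelper("stats.sqlite") as q:
#       q.c.execute("select count(*) from reduceDB")
#       print(q.c.fetchone())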


class QueryFill(QueryHelper):
    def create_indexes(self, verbose=False, used_clauses="used_clauses"):
        t = time.time()
        print("Recreating indexes...")
        queries = """
        create index `idxclid6-4` on `reduceDB` (`clauseID`, `conflicts`);
        create index `idxclidUCLS-2` on `{used_clauses}` ( `clauseID`, `used_at`);
        create index `idxcl_last_in_solver-1` on `cl_last_in_solver` ( `clauseID`, `conflicts`);
        """.format(used_clauses=used_clauses)
        for l in queries.split('\n'):
            if l.strip() == "":
                continue

            t2 = time.time()
            if verbose:
                print("Creating index: ", l)
            self.c.execute(l)
            if verbose:
                print("Index creation T: %-3.2f s" % (time.time() - t2))

        print("indexes created T: %-3.2f s" % (time.time() - t))
    def delete_and_create_all(self):
        tables = ["short", "long", "forever"]

        t = time.time()
        q = """DROP TABLE IF EXISTS `used_later`;"""
        self.c.execute(q)

        q = """DROP TABLE IF EXISTS `used_later_{name}`;"""
        for table in tables:
            self.c.execute(q.format(name=table))

        # Create and fill used_later_X tables
        q_create = """
        create table `used_later_{name}` (
            `clauseID` bigint(20) NOT NULL,
            `rdb0conflicts` bigint(20) NOT NULL,
            `used_later_{name}` bigint(20),
            `offset` bigint(20) NOT NULL
        );"""
        self.c.execute(q_create.format(name="short"))
        self.c.execute(q_create.format(name="long"))
        self.c.execute(q_create.format(name="forever"))

        # Create used_later table
        q = """
        create table `used_later` (
            `clauseID` bigint(20) NOT NULL,
            `rdb0conflicts` bigint(20) NOT NULL,
            `used_later` bigint(20)
        );"""
        self.c.execute(q)

        idxs = """
        create index `used_later_{name}_idx3` on `used_later_{name}` (`used_later_{name}`);
        create index `used_later_{name}_idx1` on `used_later_{name}` (`clauseID`, `rdb0conflicts`, `offset`);
        create index `used_later_{name}_idx2` on `used_later_{name}` (`clauseID`, `rdb0conflicts`, `used_later_{name}`, `offset`);"""

        for table in tables:
            for l in idxs.format(name=table).split('\n'):
                if l.strip() != "":
                    self.c.execute(l)

        idxs = """
        create index `used_later_idx3` on `used_later` (`used_later`);
        create index `used_later_idx1` on `used_later` (`clauseID`, `rdb0conflicts`);
        create index `used_later_idx2` on `used_later` (`clauseID`, `rdb0conflicts`, `used_later`);
        """
        for l in idxs.split('\n'):
            if l.strip() != "":
                self.c.execute(l)

        print("used_later* dropped and recreated T: %-3.2f s" % (time.time() - t))

    def fill_used_later(self, used_clauses="used_clauses"):
        # Fill the used_later table
        t = time.time()
        q = """insert into used_later
        (
        `clauseID`,
        `rdb0conflicts`,
        `used_later`
        )
        SELECT
        rdb0.clauseID
        , rdb0.conflicts
        , count(ucl.used_at) as `used_later`
        FROM
        reduceDB as rdb0
        left join {used_clauses} as ucl

        -- for any point later than now
        -- reduceDB is always present, used_later may not be, hence left join
        on (ucl.clauseID = rdb0.clauseID
            and ucl.used_at > rdb0.conflicts)

        join cl_last_in_solver
        on cl_last_in_solver.clauseID = rdb0.clauseID

        WHERE
        rdb0.clauseID != 0
        and cl_last_in_solver.conflicts >= rdb0.conflicts

        group by rdb0.conflicts, rdb0.clauseID;"""

        self.c.execute(q.format(used_clauses=used_clauses))
        print("used_later filled T: %-3.2f s" % (time.time() - t))

    def fill_used_later_X(self, name, duration, offset=0,
                          used_clauses="used_clauses",
                          forever=False):
        q_fill = """
        insert into used_later_{name}
        (
        `clauseID`,
        `rdb0conflicts`,
        `used_later_{name}`,
        `offset`
        )
        SELECT
        rdb0.clauseID
        , rdb0.conflicts
        , count(ucl.used_at) as `used_later_{name}`
        , {offset}

        FROM
        reduceDB as rdb0
        left join {used_clauses} as ucl

        -- reduceDB is always present, {used_clauses} may not be, hence left join
        on (ucl.clauseID = rdb0.clauseID
            and ucl.used_at > (rdb0.conflicts+{offset})
            and ucl.used_at <= (rdb0.conflicts+{duration}+{offset}))

        join cl_last_in_solver
        on cl_last_in_solver.clauseID = rdb0.clauseID

        WHERE
        rdb0.clauseID != 0
        and (cl_last_in_solver.conflicts >= (rdb0.conflicts + {duration} + {offset})
        or 1=={forever})

        group by rdb0.clauseID, rdb0.conflicts;"""

        t = time.time()
        self.c.execute(q_fill.format(
            name=name, used_clauses=used_clauses,
            duration=duration, offset=offset, forever=int(forever)))
        print("used_later_%s filled T: %-3.2f s" %
              (name, time.time() - t))
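
# A minimal end-to-end sketch of the class above (the file name and the
# durations are hypothetical; offset/forever follow the signatures above):
#
#   with QueryFill("stats.sqlite") as qf:
#       qf.delete_and_create_all()
#       qf.create_indexes()
#       qf.fill_used_later()
#       qf.fill_used_later_X("short", duration=10000)
#       qf.fill_used_later_X("long", duration=50000)
#       qf.fill_used_later_X("forever", duration=100000, forever=True)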


def write_mit_header(f):
    f.write("""/******************************************
Copyright (c) 2018, Mate Soos

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
***********************************************/\n\n""")

def parse_configs(confs):
    match = re.match(r"^([0-9]*)-([0-9]*)$", confs)
    if not match:
        print("ERROR: we cannot parse your config options: '%s'" % confs)
        exit(-1)

    conf_from = int(match.group(1))
    conf_to = int(match.group(2))+1
    if conf_to <= conf_from:
        print("ERROR: Conf range is not increasing")
        exit(-1)

    print("Running configs:", list(range(conf_from, conf_to)))
    return conf_from, conf_to
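
# Example: parse_configs("3-6") prints [3, 4, 5, 6] and returns (3, 7),
# i.e. bounds ready to be passed to range(conf_from, conf_to).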


def get_features(fname):
    best_features = []
    check_file_exists(fname)
    with open(fname, "r") as f:
        for l in f:
            l = l.strip()
            if l != "":
                best_features.append(l)

    return best_features


def helper_divide(dividend, divisor, df, features, verb, name=None):
    """
    to be used like:
    import functools
    divide = functools.partial(helper_divide, df=df, features=features, verb=options.verbose)
    """

    # could guard against missing features like this:
    # if dividend not in features or divisor not in features:
    #     return None

    # divide
    if verb:
        print("Dividing. dividend: '%s' divisor: '%s' " % (dividend, divisor))

    if name is None:
        name = "(" + dividend + "/" + divisor + ")"

    # the small epsilon guards against division by zero
    df[name] = df[dividend].div(df[divisor] + 1e-9)
    return name

def helper_larger_than(lhs, rhs, df, features, verb):
    """
    to be used like:
    import functools
    larger_than = functools.partial(helper_larger_than, df=df, features=features, verb=options.verbose)
    """

    if verb:
        print("Calculating '%s' > '%s' " % (lhs, rhs))

    name = "(" + lhs + ">" + rhs + ")"
    df[name] = (df[lhs] > df[rhs]).astype(int)
    return name

def helper_add(toadd, df, features, verb):
    """
    to be used like:
    import functools
    add = functools.partial(helper_add, df=df, features=features, verb=options.verbose)
    """

    if verb:
        print("Calculating: the feature addition of: %s" % toadd)

    # build a name like "(a+b+c)" and sum the columns element-wise
    name = "(" + "+".join(toadd) + ")"
    df[name] = df[toadd[0]]
    for col in toadd[1:]:
        df[name] += df[col]
    return name
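
# A small worked example of the three helpers above (column names made up):
#
#   df = pd.DataFrame({"a": [1.0, 2.0], "b": [2.0, 2.0], "c": [3.0, 4.0]})
#   helper_divide("a", "b", df, list(df), verb=False)       # adds "(a/b)"
#   helper_larger_than("a", "b", df, list(df), verb=False)  # adds "(a>b)" as 0/1
#   helper_add(["a", "b", "c"], df, list(df), verb=False)   # adds "(a+b+c)"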


def dangerous(conn):
    # Speed over safety: keep the rollback journal in memory and skip fsync.
    # A crash mid-run can corrupt the database, hence the name.
    conn.execute("PRAGMA journal_mode = MEMORY")
    conn.execute("PRAGMA synchronous = OFF")


def drop_idxs(conn):
    q = """
    SELECT name FROM sqlite_master WHERE type == 'index'
    """
    conn.execute(q)
    rows = conn.fetchall()
    queries = ""
    for row in rows:
        print("Will delete index:", row[0])
        queries += "drop index if exists `%s`;\n" % row[0]

    t = time.time()
    for q in queries.split("\n"):
        if q.strip() != "":
            conn.execute(q)

    print("Removed indexes: T: %-3.2f s" % (time.time() - t))


def get_columns(tablename, verbose, conn):
    q = "pragma table_info(%s);" % tablename
    conn.execute(q)
    rows = conn.fetchall()
    columns = []
    for row in rows:
        if verbose:
            print("Using column in table {tablename}: {col}".format(
                tablename=tablename, col=row[1]))
        columns.append(row[1])

    return columns

def query_fragment(tablename, not_cols, short_name, verbose, conn):
    cols = get_columns(tablename, verbose, conn)
    filtered_cols = list(set(cols).difference(not_cols))
    ret = ""
    for col in filtered_cols:
        ret += ", {short_name}.`{col}` as `{short_name}.{col}`\n".format(
            col=col, short_name=short_name)

    if verbose:
        print("query for short name {short_name}: {ret}".format(
            short_name=short_name, ret=ret))

    return ret


def not_inside(not_these, inside_here):
    for not_this in not_these:
        if not_this in inside_here:
            return False

    return True

def add_computed_szfeat_for_clustering(df):
    print("Adding computed clustering features...")

    not_these = ["std", "min", "mean", "_per_", "binary", "vcg", "pnr", "horn", "max"]
    todiv = []
    for x in list(df):
        if "szfeat_cur" in x and x[-3:] != "var" and not_inside(not_these, x):
            todiv.append(x)

    # relative data
    cols = list(df)
    for col in cols:
        if "szfeat_cur" in col:
            for divper in todiv:
                df["("+col+"/"+divper+")"] = df[col]/df[divper]
                df["("+col+"<"+divper+")"] = (df[col] < df[divper]).astype(int)

    print("Added computed features.")


# to check for too large or NaN values:
def check_too_large_or_nan_values(df, features):
    print("Checking for too large or NaN values...")
    # features = df.columns.values.flatten().tolist()
    for _, row in df[features].iterrows():
        for x, name in zip(row, features):
            try:
                np.isfinite(x)
            except TypeError:
                print("Name:", name)
                print("Problem with value:", x)
                print(row)
                continue

            if not np.isfinite(x) or x > np.finfo(np.float32).max:
                print("issue with data for features: ", name, x)

    print("Checking finished.")


def print_confusion_matrix(cm,
                           normalize=False,
                           title='Confusion matrix',
                           cmap=plt.cm.Blues):
    """
    This function prints the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    print(title)
    if mlflow_avail:
        mlflow.log_metric(title, cm[0][0])
    np.set_printoptions(precision=2)
    print(cm)


def calc_min_split_point(df, min_samples_split):
    split_point = int(float(df.shape[0])*min_samples_split)
    if split_point < 10:
        split_point = 10
    print("Minimum split point: ", split_point)
    return split_point
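
# Example: with a 100,000-row frame and min_samples_split=0.00005 the raw
# value is 5, which is clamped up to the minimum of 10.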


def calc_regression_error(data, features, to_predict, clf, toprint,
                          average="binary", highlight=False):
    X_data = data[features]
    y_data = data[to_predict]
    print("Number of elements:", X_data.shape)
    if data.shape[0] <= 1:
        print("Cannot calculate regression error, too few elements")
        return None
    y_pred = clf.predict(X_data)
    main_error = sklearn.metrics.mean_squared_error(y_data, y_pred)
    print("Mean squared error is: ", main_error)

    for start, end in [(0, 10), (1, 10), (10, 100), (100, 1000), (1000, 10000), (10000, 1000000)]:
        x = "--> Strata  %10d <= %20s < %10d " % (start, to_predict, end)
        myfilt = data[(data[to_predict] >= start) & (data[to_predict] < end)]
        X_data = myfilt[features]
        y_data = myfilt[to_predict]
        y = " -- elements: {:20}".format(str(X_data.shape))
        if myfilt.shape[0] <= 1:
            # too few elements to calculate regression error
            error = -1
        else:
            y_pred = clf.predict(X_data)
            error = sklearn.metrics.mean_squared_error(y_data, y_pred)
        print("%s %s msqe: %13.1f" % (x, y, error))

    for start, end in [(0, 3), (3, 8), (8, 15), (15, 25), (25, 50), (50, 100), (100, 1000000)]:
        x = "--> Strata  %10d <= %20s < %10d " % (start, "rdb0.glue", end)
        myfilt = data[(data["rdb0.glue"] >= start) & (data["rdb0.glue"] < end)]
        X_data = myfilt[features]
        y_data = myfilt[to_predict]
        y = " -- elements: {:20}".format(str(X_data.shape))
        if myfilt.shape[0] <= 1:
            error = -1
        else:
            y_pred = clf.predict(X_data)
            error = sklearn.metrics.mean_squared_error(y_data, y_pred)
        print("%s %s msqe: %13.1f" % (x, y, error))

    return main_error


def conf_matrixes(data, features, to_predict, clf, toprint,
                  average="binary", highlight=False):
    # get data
    X_data = data[features]
    y_data = data[to_predict]
    print("Number of elements:", X_data.shape)
    if data.shape[0] <= 1:
        print("Cannot calculate confusion matrix, too few elements")
        return None

    # Perform prediction, binarizing regressor output at 0.5
    def f(x):
        if x > 0.5:
            return 1
        else:
            return 0

    y_pred = clf.predict(X_data)
    if isinstance(y_pred[0], np.floating):
        y_pred = np.array([f(x) for x in y_pred])

    # calc acc, precision, recall
    accuracy = sklearn.metrics.accuracy_score(
        y_data, y_pred)

    precision = sklearn.metrics.precision_score(
        y_data, y_pred, pos_label=1, average=average)

    recall = sklearn.metrics.recall_score(
        y_data, y_pred, pos_label=1, average=average)

    # ROC AUC
    predsi = np.array(y_pred)
    y_testi = pd.DataFrame(y_data)["x.class"].squeeze()
    try:
        roc_auc = sklearn.metrics.roc_auc_score(y_testi, predsi)
    except ValueError:
        print("NOTE: ROC AUC is set to 0 because of completely one-sided OK/BAD")
        roc_auc = 0

    # record to mlflow
    if mlflow_avail:
        mlflow.log_metric(toprint + " -- accuracy", accuracy)
        mlflow.log_metric(toprint + " -- precision", precision)
        mlflow.log_metric(toprint + " -- recall", recall)
        mlflow.log_metric(toprint + " -- roc_auc", roc_auc)

    color = "white"
    bckgrnd = "on_grey"
    if highlight:
        color = "green"
        bckgrnd = "on_grey"

    txt = "%s prec : %-3.4f  recall: %-3.4f accuracy: %-3.4f roc_auc: %-3.4f"
    vals = (toprint, precision, recall, accuracy, roc_auc)
    if termcolor_avail:
        cprint(txt % vals, color, bckgrnd)
    else:
        print(txt % vals)

    # Plot confusion matrix
    cnf_matrix = sklearn.metrics.confusion_matrix(
        y_true=y_data, y_pred=y_pred)
    print_confusion_matrix(
        cnf_matrix,
        title='Confusion matrix without normalization -- %s' % toprint)
    print_confusion_matrix(
        cnf_matrix, normalize=True,
        title='Normalized confusion matrix -- %s' % toprint)

    return roc_auc
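
# A minimal usage sketch (the frame, feature list and classifier names are
# assumptions; to_predict must be "x.class", which the ROC AUC code above
# relies on):
#
#   roc_auc = conf_matrixes(test_df, feats, "x.class", clf,
#                           "test data", highlight=True)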


def check_file_exists(fname):
    try:
        f = open(fname)
    except IOError:
        print("File '%s' not accessible" % fname)
        exit(-1)
    else:
        f.close()


def clear_data_from_str(df):
    values2nums = {'luby': 0, 'glue': 1, 'geom': 2}
    df.loc[:, ('cl.cur_restart_type')] = \
        df.loc[:, ('cl.cur_restart_type')].map(values2nums)

    df.loc[:, ('rdb0.cur_restart_type')] = \
        df.loc[:, ('rdb0.cur_restart_type')].map(values2nums)

    df.loc[:, ('rst_cur.restart_type')] = \
        df.loc[:, ('rst_cur.restart_type')].map(values2nums)

    if "rdb1.cur_restart_type" in df:
        df.loc[:, ('rdb1.cur_restart_type')] = \
            df.loc[:, ('rdb1.cur_restart_type')].map(values2nums)
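
# Example: after the mapping above, a restart-type column holding
# ["luby", "glue", "geom"] becomes [0, 1, 2].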


def output_to_classical_dot(clf, features, fname):
    sklearn.tree.export_graphviz(clf, out_file=fname,
                                 feature_names=features,
                                 class_names=clf.classes_,
                                 filled=True, rounded=True,
                                 special_characters=True,
                                 proportion=True)
    print("Run dot:")
    print("dot -Tpng {fname} -o {fname}.png".format(fname=fname))
    print("gwenview {fname}.png".format(fname=fname))


def print_feature_ranking(clf, X_train, top_num_features, features, plot=False):
    best_features = []
    importances = clf.feature_importances_
    std = np.std(
        [tree.feature_importances_ for tree in clf.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]
    indices = indices[:top_num_features]
    myrange = min(X_train.shape[1], top_num_features)

    # Print the feature ranking
    print("Feature ranking:")

    for f in range(myrange):
        print("%-3d  %-55s -- %8.4f" %
              (f + 1, features[indices[f]], importances[indices[f]]))
        best_features.append(features[indices[f]])

    # Plot the feature importances of the clf
    if plot:
        plot_feature_importances(importances, indices, myrange, std, features)

    return best_features


def plot_feature_importances(importances, indices, myrange, std, features):
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(myrange), importances[indices],
            color="r", align="center",
            yerr=std[indices])
    plt.xticks(range(myrange), [features[x]
                                for x in indices], rotation=45)
    plt.xlim([-1, myrange])
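
# A minimal usage sketch (assumes clf is a fitted ensemble such as a random
# forest, since the std-dev computation above reads clf.estimators_):
#
#   best = print_feature_ranking(clf, X_train, top_num_features=20,
#                                features=feats, plot=True)
#   plt.show()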


def cldata_add_computed_features(df, verbose):
    print("Adding computed features...")
    del df["cl.conflicts"]
    divide = functools.partial(helper_divide, df=df, features=list(df), verb=verbose)
    larger_than = functools.partial(helper_larger_than, df=df, features=list(df), verb=verbose)
    add = functools.partial(helper_add, df=df, features=list(df), verb=verbose)

    # relative overlaps
    print("Relative overlaps...")
    divide("cl.num_total_lits_antecedents", "cl.antec_sum_size_hist")

    # delete these features, which are always None
    del df["cl.antecedents_glue_long_reds_avg"]
    del df["cl.antecedents_glue_long_reds_max"]
    del df["cl.antecedents_glue_long_reds_min"]
    del df["cl.antecedents_glue_long_reds_var"]
    del df["cl.antecedents_long_red_age_avg"]
    del df["cl.antecedents_long_red_age_var"]
    del df["cl.decision_level_hist"]
    del df["sum_cl_use.first_confl_used"]
    del df["sum_cl_use.last_confl_used"]

    # ************
    # TODO decision level and branch depth are the same, right???
    # ************
    print("size/glue/trail rel...")
    divide("cl.trail_depth_level", "cl.trail_depth_level_hist")

    rst_cur_all_props = add(["rst_cur.propBinRed",
                             "rst_cur.propBinIrred",
                             "rst_cur.propLongRed",
                             "rst_cur.propLongIrred"])

    divide("cl.num_total_lits_antecedents", "cl.num_antecedents")

    # sum RDB: add the rdb0 and rdb1 versions of each column
    orig_cols = list(df)
    for col in orig_cols:
        if ("rdb0" in col) and "restart_type" not in col:
            col2 = col.replace("rdb0", "rdb1")
            cboth = "("+col+"+"+col2+")"
            df[cboth] = df[col]+df[col2]

    rdb0_act_ranking_rel = divide("rdb0.act_ranking", "rdb0.tot_cls_in_db", name="rdb0_act_ranking_rel")
    rdb1_act_ranking_rel = divide("rdb1.act_ranking", "rdb1.tot_cls_in_db", name="rdb1_act_ranking_rel")
    rdb0_plus_rdb1_ranking_rel = add([rdb0_act_ranking_rel, rdb1_act_ranking_rel])

    divide("rdb0.sum_uip1_used", "cl.time_inside_solver")
    divide("rdb1.sum_uip1_used", "cl.time_inside_solver")
    divide("rdb0.sum_propagations_made", "cl.time_inside_solver")
    divide("rdb1.sum_propagations_made", "cl.time_inside_solver")

    divisors = [
        "cl.size_hist"
        , "cl.glue_hist"
        , "rdb0.glue"
        , "cl.orig_glue"
        , "cl.glue_before_minim"
        , "cl.glue_hist_queue"
        , "cl.glue_hist_long"
        # , "cl.decision_level_hist"
        , "cl.num_resolutions_hist_lt"
        # , "cl.trail_depth_level_hist"
        # , "cl.backtrack_level_hist"
        , "cl.branch_depth_hist_queue"
        , "cl.antec_overlap_hist"
        , "(cl.num_total_lits_antecedents/cl.num_antecedents)"
        , "cl.num_antecedents"
        , rdb0_act_ranking_rel
        , rdb1_act_ranking_rel
        #, "szfeat_cur.var_cl_ratio"
        , "cl.time_inside_solver"
        #, "((double)(rdb0.act_ranking_rel+rdb1.act_ranking_rel)/2.0)"
        #, "sqrt(rdb0.act_ranking_rel)"
        #, "sqrt(rdb1.act_ranking_rel)"
        #, "sqrt(rdb0_and_rdb1.act_ranking_rel_avg)"
        # , "cl.num_overlap_literals"
        # , "rst_cur.resolutions"
        #, "rdb0.act_ranking_top_10"
        ]

    # Thanks to Chai Kian Ming Adam for the idea of using LOG instead of SQRT
    # add LOG
    if True:
        toadd = []
        for divisor in divisors:
            x = "log2("+divisor+")"
            df[x] = df[divisor].apply(np.log2)
            toadd.append(x)
        divisors.extend(toadd)

    # relative data
    cols = list(df)
    for col in cols:
        if ("rdb" in col or "cl." in col or "rst" in col) and "restart_type" not in col and "tot_cls_in" not in col and "rst_cur" not in col:
            for divisor in divisors:
                divide(divisor, col)
                divide(col, divisor)

    divisors.extend([
        rst_cur_all_props
        , "rdb0.last_touched_diff"
        , "rdb0.used_for_uip_creation"
        , "rdb1.used_for_uip_creation"
        , "rdb0.propagations_made"
        , "rdb1.propagations_made"
        , "rdb0.sum_propagations_made"
        , "(rdb0.sum_propagations_made/cl.time_inside_solver)"
        , "(rdb1.sum_propagations_made/cl.time_inside_solver)"
        , "(rdb0.sum_uip1_used/cl.time_inside_solver)"
        , "(rdb1.sum_uip1_used/cl.time_inside_solver)"])

    # smaller/larger than
    if False:
        for col in cols:
            if ("rdb" in col or "cl." in col or "rst" in col) and "restart_type" not in col:
                for divisor in divisors:
                    larger_than(col, divisor)

    # satzilla stuff
    if False:
        divisors = [
            "szfeat_cur.numVars",
            "szfeat_cur.numClauses",
            "szfeat_cur.var_cl_ratio",
            "szfeat_cur.avg_confl_size",
            "szfeat_cur.avg_branch_depth",
            "szfeat_cur.red_glue_distr_mean"
        ]
        for col in orig_cols:
            if "szfeat" in col:
                for divisor in divisors:
                    if "min" not in divisor:
                        divide(col, divisor)
                        larger_than(col, divisor)

    # relative RDB
    if True:
        print("Relative RDB...")
        for col in orig_cols:
            if "rdb0" in col and "restart_type" not in col:
                rdb0 = col
                rdb1 = col.replace("rdb0", "rdb1")
                larger_than(rdb0, rdb1)

                raw_col = col.replace("rdb0.", "")
                if raw_col not in ["sum_propagations_made", "propagations_made", "dump_no", "conflicts_made", "used_for_uip_creation", "sum_uip1_used", "activity_rel", "last_touched_diff", "ttl"]:
                    print(rdb0)
                    divide(rdb0, rdb1)

    # smaller-or-greater comparisons
    print("smaller-or-greater comparisons...")
    larger_than("cl.antec_sum_size_hist", "cl.num_total_lits_antecedents")
    larger_than("cl.antec_overlap_hist", "cl.num_overlap_literals")

    # print("flatten/list...")
    #old = set(df.columns.values.flatten().tolist())
    #df = df.dropna(how="all")
    #new = set(df.columns.values.flatten().tolist())
    #if len(old - new) > 0:
        #print("ERROR: a NaN number turned up")
        #print("columns: ", (old - new))
        #assert(False)
        #exit(-1)