xdelta3/testing/xdelta3-regtest.py

#!/usr/bin/python2.6
# xdelta 3 - delta compression tools and library
# Copyright (C) 2003, 2006, 2007, 2008.  Joshua P. MacDonald
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# TODO: test 1.5 vs. greedy

import os, sys, math, re, time, types, array, random
import xdelta3

#RCSDIR = '/mnt/polaroid/Polaroid/orbit_linux/home/jmacd/PRCS'
#RCSDIR = '/tmp/PRCS_read_copy'
#SAMPLEDIR = "/tmp/WESNOTH_tmp/diff"

#RCSDIR = 'G:/jmacd/PRCS_copy'
#SAMPLEDIR = "C:/sample_data/Wesnoth/tar"

# Root directory crawled for RCS ",v" files (see GetTestRcsFiles).
RCSDIR = '/Users/jmacd/src/ftp.kernel.org'
# Directory tree whose files are paired up by SampleDataTest in __main__.
SAMPLEDIR = '/Users/jmacd/src/xdelta3/linux'

# RcsFile.FilePairsByDate skips a version pair when either checked-out
# file is smaller than this many bytes.
MIN_SIZE       = 0

# NOTE(review): not referenced anywhere in the visible part of this file.
TIME_TOO_SHORT = 0.050

# Defaults for TimedTest: the first SKIP_TRIALS runs are discarded, then
# at least MIN_TRIALS and at most MAX_TRIALS runs are measured.
SKIP_TRIALS    = 2
MIN_TRIALS     = 3
MAX_TRIALS     = 15

# TimedTest stops measuring once the standard deviation of the elapsed
# times, as a percentage of their mean, drops below this value.
# 10 = fast 1.5 = slow
MIN_STDDEV_PCT = 1.5

# How many results per round
# MAX_RESULTS: number of configurations RunOptimizationLoop tests per round.
# TEST_ROUNDS: passed as `rounds` to RunRegressionTest in __main__.
# KEEP_P: fraction of MAX_RESULTS (best scores first) re-tested next round.
MAX_RESULTS = 500
TEST_ROUNDS = 10
KEEP_P = (0.5)

# For RCS testing, what percent to select
# (fraction of the eligible RCS files sampled by RcsFinder.MakeBigFiles)
FILE_P = (0.50)

# For run-speed tests
# (bounds handed to Decimals() by RunSpeedTest to pick the file sizes)
MIN_RUN = 1000 * 1000 * 1
MAX_RUN = 1000 * 1000 * 10

# Testwide defaults
# (Xdelta3Runner puts these first on every command line it builds)
ALL_ARGS = [
    '-q'  # '-vv'
    ]

# The first 7 args go to -C
# (ConfigToArgs joins config[0:SOFT_CONFIG_CNT] with commas after '-C')
SOFT_CONFIG_CNT = 7

# Position of every parameter inside a configuration tuple.  The first
# SOFT_CONFIG_CNT names are emitted together as the comma-separated -C
# argument; each remaining name is translated through CONFIG_ARGMAP.
CONFIG_ORDER = [ 'large_look',
                 'large_step',
                 'small_look',
                 'small_chain',
                 'small_lchain',
                 'max_lazy',
                 'long_enough',

                 # > SOFT_CONFIG_CNT
                 'nocompress',
                 'winsize',
                 'srcwinsize',
                 'sprevsz',
                 'iopt',
                 'djw',
                 'altcode',
                 ]

# Command-line flag for every CONFIG_ORDER entry past SOFT_CONFIG_CNT.
# ConfigToArgs emits the bare flag when the value is 'true', nothing when
# it is 'false', and 'flag=value' for any other value.
CONFIG_ARGMAP = {
    'winsize'    : '-W',
    'srcwinsize' : '-B',
    'sprevsz'    : '-P',
    'iopt'       : '-I',
    'nocompress' : '-N',
    'djw'        : '-Sdjw',
    'altcode'    : '-T',
    }

def INPUT_SPEC(rand):
    """Return a dict mapping each CONFIG_ORDER name to a value generator.

    Every generator takes one argument (the partially built name->value
    map of the configuration being assembled, unused by the current
    generators) and returns the value for its parameter.  Randomized
    parameters draw from `rand`, a random.Random-like object, each time
    the generator is called.
    """
    def pick(options):
        # Draw one of `options` through `rand` on every call.
        return lambda d: rand.choice(options)

    def fixed(value):
        # Always yield the same value; consumes no randomness.
        return lambda d: value

    spec = {}

    # Time/space costs:

    # -C 1,2,3,4,5,6,7
    spec['large_look']   = pick([9, 10, 11, 12])
    spec['large_step']   = pick([25, 26, 27, 28, 29, 30])
    spec['small_look']   = pick([4])
    spec['small_chain']  = pick([1])
    spec['small_lchain'] = pick([1])
    spec['max_lazy']     = pick([4, 5, 6, 7, 8, 9, 10])

    # Note: long_enough only refers to small matching and has no effect if
    # small_chain == 1.
    spec['long_enough']  = pick([4])

    # -N
    spec['nocompress']   = pick(['false'])

    # -T
    spec['altcode']      = pick(['false'])

    # -S djw
    spec['djw']          = pick(['false'])

    # Memory costs:

    # -W
    spec['winsize']      = fixed(8 * (1 << 20))

    # -B
    spec['srcwinsize']   = fixed(64 * (1 << 20))

    # -I 0 is unlimited
    spec['iopt']         = fixed(0)

    # -P only powers of two
    spec['sprevsz']      = pick([4 * (1 << 16)])

    return spec
#end

#
# Per-process scratch directory; __main__ removes and recreates it before
# the tests run.
TMPDIR = '/tmp/xd3regtest.%d' % os.getpid()

# Scratch files inside TMPDIR:
#   RUNFILE  input file sized by SetFileSize for RunSpeedTest
#   DFILE    encoder output written by TimedTest
#   RFILE    decoder output (reconstruction) written by TimedTest
#   CMPTMP*  gunzipped copies compared by Xdelta3Runner.Verify
RUNFILE = os.path.join(TMPDIR, 'run')
DFILE   = os.path.join(TMPDIR, 'output')
RFILE   = os.path.join(TMPDIR, 'recon')
CMPTMP1 = os.path.join(TMPDIR, 'cmptmp1')
CMPTMP2 = os.path.join(TMPDIR, 'cmptmp2')

# States of the rlog-output parser in RcsFile.Sum1Rlog:
# HEAD -> BAR after the revision count, then BAR -> REV -> DATE -> BAR
# once per revision.
HEAD_STATE = 0
BAR_STATE  = 1
REV_STATE  = 2
DATE_STATE = 3

# RCS files whose path matches this are skipped by RcsFinder.MakeBigFiles.
IGNORE_FILENAME  = re.compile('.*\\.(gif|jpg).*')

# rcs output
RE_TOTREV  = re.compile('total revisions: (\\d+)')
RE_BAR     = re.compile('----------------------------')
RE_REV     = re.compile('revision (.+)')
RE_DATE    = re.compile('date: ([^;]+);.*')
# xdelta output
# NOTE(review): neither pattern below is referenced in the visible part
# of this file.
RE_HDRSZ   = re.compile('VCDIFF header size: +(\\d+)')
RE_EXTCOMP = re.compile('XDELTA ext comp.*')

def c2str(c):
    """Render the values of configuration `c` as one space-separated string."""
    rendered = []
    for value in c:
        rendered.append('%s' % value)
    return ' '.join(rendered)
#end

def SumList(l):
    """Return the sum of the items of `l`, added left to right.

    An empty input yields 0 (the previous reduce()-based version raised
    TypeError).  For non-empty input the first item seeds the total, so
    any values supporting `+` with each other are accepted, as before.
    """
    items = list(l)
    if not items:
        return 0
    total = items[0]
    for x in items[1:]:
        total = total + x
    return total
#end

# Summary statistics of a list of numbers, exposed as attributes:
# total, mean, s (sample stddev), q0..q4 (min, quartiles, max; q2 is the
# median), siqr = (q3-q1)/2 ("semi-interquartile range"),
# spread = max-min, and str (a printable one-line summary).
class StatList:
    def __init__(self,l,desc):
        """Compute the statistics of `l`.

        l    -- non-empty list of numbers; it is sorted IN PLACE and kept
                as self.l.
        desc -- label used as the prefix of self.str.

        A single-element list is accepted: its standard deviation is
        reported as 0.0 and every quartile equals that element.
        """
        cnt = len(l)
        assert cnt > 0, 'StatList needs at least one sample'
        l.sort()
        last        = cnt - 1
        self.cnt    = cnt
        self.l      = l
        self.total  = sum(l)
        self.mean   = self.total / float(cnt)
        if cnt > 1:
            # sample standard deviation (divides by cnt-1)
            self.s  = math.sqrt(sum([(x - self.mean) * (x - self.mean)
                                     for x in l]) /
                                float(cnt - 1))
        else:
            self.s  = 0.0
        #end
        # Quartile positions are rounded to the nearest index and clamped
        # so that short lists cannot index past the end.
        self.q0     = l[0]
        self.q1     = l[min(last, int(cnt/4.0+0.5))]
        self.q2     = l[min(last, int(cnt/2.0+0.5))]
        self.q3     = l[min(last, int((3.0*cnt)/4.0+0.5))]
        self.q4     = l[last]
        self.siqr   = (self.q3-self.q1)/2.0
        self.spread = (self.q4-self.q0)
        if cnt == 1:
            self.str = '%s %s' % (desc, l[0])
        else:
            # '%%' is required here: a bare '%-i' in "25%-ile" would be
            # parsed as a conversion and consume an argument.
            self.str = '%s mean %.1f: 25%%-ile %d %d %d %d %d' % \
                (desc, self.mean, self.q0, self.q1, self.q2, self.q3, self.q4)
        #end
    #end
#end

def RunCommand(args, ok = [0]):
    """Run `args` (argv list, looked up on PATH) and wait for it to finish.

    Raises CommandError unless the value returned by os.spawnvp is listed
    in `ok`.  `ok` is only read, never modified.
    """
    #print 'run command %s' % (' '.join(args))
    status = os.spawnvp(os.P_WAIT, args[0], args)
    if status in ok:
        return
    raise CommandError(args, 'exited %d' % status)
#end

def RunCommandIO(args,infn,outfn):
    """Run `args` with stdin read from `infn` and stdout written to `outfn`.

    `outfn` is created if needed and truncated.  Raises CommandError when
    the command does not terminate normally or exits with a non-zero
    status.
    """
    p = os.fork()
    if p == 0:
        # Child process.  It must never fall back into the caller's code:
        # if the redirections or the exec fail, report and exit at once
        # (127 mirrors the shell's "command could not be run" status).
        try:
            os.dup2(os.open(infn,os.O_RDONLY),0)
            os.dup2(os.open(outfn,os.O_CREAT|os.O_TRUNC|os.O_WRONLY),1)
            os.execvp(args[0], args)
        except Exception as e:
            sys.stderr.write('cannot run %s: %s\n' % (args[0], e))
        finally:
            os._exit(127)
        #end
    #end

    # Parent process: wait for the child and check how it ended.
    s = os.waitpid(p,0)
    if not os.WIFEXITED(s[1]):
        # WEXITSTATUS is meaningless here (e.g. killed by a signal).
        raise CommandError(args, 'terminated abnormally (wait status %d)' % s[1])
    #end
    o = os.WEXITSTATUS(s[1])
    if o != 0:
        raise CommandError(args, 'exited %d' % o)
    #end
#end

class TimedTest:
    """Time one encode and one decode of `target` (against `source`).

    Construction runs everything: the encoder is timed writing DFILE, the
    decoder is timed reading DFILE and writing RFILE, and the
    reconstruction is checked with runnable.Verify.  Results:

      encode_time, decode_time -- StatList of wall-clock seconds per trial
      encode_size              -- value of runnable.EncodeSize(DFILE)
    """
    def __init__(self, target, source, runnable,
                 skip_trials = SKIP_TRIALS,
                 min_trials = MIN_TRIALS,
                 max_trials = MAX_TRIALS,
                 min_stddev_pct = MIN_STDDEV_PCT):
        # runnable must provide Encode(target, source, output),
        # Decode(input, source, output), Verify(target, recon) and
        # EncodeSize(output).
        self.target = target
        self.source = source
        self.runnable = runnable

        self.skip_trials = skip_trials
        # never require more measured trials than the maximum allows
        self.min_trials = min(min_trials, max_trials)
        self.max_trials = max_trials
        self.min_stddev_pct = min_stddev_pct

        self.encode_time = self.DoTest(DFILE,
                                       lambda x: x.Encode(self.target,
                                                          self.source, DFILE))
        self.encode_size = runnable.EncodeSize(DFILE)

        self.decode_time = self.DoTest(RFILE,
                                       lambda x: x.Decode(DFILE,
                                                          self.source, RFILE),
                                       )
        runnable.Verify(self.target, RFILE)
    #end

    def DoTest(self, fname, func):
        """Call func(self.runnable) repeatedly and return a StatList of
        the elapsed wall-clock times of the measured trials.

        fname is removed before every trial so each run recreates its
        output.  The first skip_trials runs are not recorded.  The loop
        ends when the times are stable enough (stddev below
        min_stddev_pct percent of the mean) or max_trials measurements
        have been taken.
        """
        trials   = 0
        measured = []

        while 1:
            try:
                os.remove(fname)
            except OSError:
                # nothing to remove (e.g. first trial)
                pass

            start_time  = time.time()
            start_clock = time.clock()

            func(self.runnable)

            total_clock = (time.clock() - start_clock)
            total_time  = (time.time() - start_time)

            # clamp to a tiny positive value so the mean is never zero
            # when it is used as a divisor below
            elap_time  = max(total_time,  0.0000001)
            elap_clock = max(total_clock, 0.0000001)

            trials = trials + 1

            # skip some of the first trials
            if trials > self.skip_trials:
                measured.append((elap_clock, elap_time))
                #print 'measurement total: %.1f ms' % (total_time * 1000.0)

            # at least so many
            if trials < (self.skip_trials + self.min_trials):
                #print 'continue: need more trials: %d' % trials
                continue

            # compute %variance
            done = 0
            if self.skip_trials + self.min_trials <= 2:
                # Too few samples were requested for a meaningful
                # deviation: duplicate them so the statistics below have
                # at least two data points, and stop after this pass.
                measured = measured + measured;
                done = 1
            #end

            # only the wall-clock component (index 1) is summarized
            time_stat = StatList([x[1] for x in measured], 'elap time')
            sp = float(time_stat.s) / float(time_stat.mean)

            # what if MAX_TRIALS is exceeded?
            too_many = (trials - self.skip_trials) >= self.max_trials
            good = (100.0 * sp) < self.min_stddev_pct
            if done or too_many or good:
                trials = trials - self.skip_trials
                if not done and not good:
                    #print 'too many trials: %d' % trials
                    pass
                #clock = StatList([x[0] for x in measured], 'elap clock')
                return time_stat
            #end
        #end
    #end
#end

def Decimals(start, end):
    """Return a list of sizes stepping through decades from `start`.

    The list holds start, 2*start, ..., 9*start, then 10*start,
    20*start, ... and so on, finishing with the first start*10**k that
    is >= end.  Example: Decimals(1, 100) is
    [1, 2, ..., 9, 10, 20, ..., 90, 100].

    Raises ValueError for a zero start (range() rejects a zero step) and
    for a negative start that could never reach `end`, which previously
    looped forever.
    """
    if start < 0 and start * 10 < end:
        raise ValueError('Decimals(%r, %r) would never terminate' %
                         (start, end))
    l = []
    step = start
    while 1:
        # extend() works whether range() returns a list or a lazy
        # sequence, and avoids copying the accumulated list every decade.
        l.extend(range(step, step * 10, step))
        if step * 10 >= end:
            l.append(step * 10)
            break
        step = step * 10
    return l
#end

# This tests the raw speed on inputs created by SetFileSize (ftruncate),
# i.e. presumably one long run of zero-valued bytes of each length.
def RunSpeedTest():
    """For every run length from Decimals(MIN_RUN, MAX_RUN), time and
    report xdelta3 with three -W window sizes, the in-memory module API
    ('swig') and gzip on RUNFILE."""
    window_sizes = [(1<<20, '1MB '),
                    (1<<19, '512k'),
                    (1<<18, '256k')]

    for L in Decimals(MIN_RUN, MAX_RUN):
        SetFileSize(RUNFILE, L)

        for (winsize, label) in window_sizes:
            runner = Xdelta3Runner(['-W', str(winsize)])
            ReportSpeed(L, TimedTest(RUNFILE, None, runner), label)
        #end

        ReportSpeed(L, TimedTest(RUNFILE, None, Xdelta3Mod1(RUNFILE)), 'swig')
        ReportSpeed(L, TimedTest(RUNFILE, None, GzipRun1()), 'gzip')
    #end
#end

def SetFileSize(F,L):
    """Create file `F` if necessary and set its length to exactly `L`
    bytes with ftruncate (existing content beyond L is dropped)."""
    fd = os.open(F, os.O_CREAT | os.O_WRONLY)
    try:
        os.ftruncate(fd,L)
        # sanity check that the OS applied the requested size
        assert os.fstat(fd).st_size == L
    finally:
        # always release the descriptor, even when a step above fails
        os.close(fd)
    #end
#end

def ReportSpeed(L,tr,desc):
    """Print one result line for run length `L`: encoded size plus mean
    encode and decode times (converted to milliseconds) taken from the
    TimedTest-like object `tr`, prefixed with the label `desc`."""
    encode_ms = tr.encode_time.mean * 1000.0
    decode_ms = tr.decode_time.mean * 1000.0
    print('%s run length %u: size %u: time %.3f ms: decode %.3f ms' %
          (desc, L, tr.encode_size, encode_ms, decode_ms))
#end

class Xdelta3RunClass:
    """Factory for Xdelta3Runner objects sharing one list of extra
    command-line arguments; str() gives those arguments joined by
    spaces."""

    def __init__(self, extra):
        # extra: list of additional xdelta3 arguments (strings)
        self.extra = extra
    #end

    def __str__(self):
        separator = ' '
        return separator.join(self.extra)
    #end

    def New(self):
        runner = Xdelta3Runner(self.extra)
        return runner
    #end
#end

class Xdelta3Runner:
    """Runs xdelta3 either through the imported xdelta3 module or, with
    forkexec, through the '../xdelta3' executable."""

    # Use "forkexec" to get special command-line only features like
    # external compression support.
    def __init__(self, extra, forkexec=False):
        self.extra = extra
        self.forkexec = forkexec
    #end

    def Encode(self, target, source, output):
        """Encode `target` into `output`; `source` may be None/empty for
        no source file.  The extra arguments apply to encoding only."""
        cmdline = ALL_ARGS + self.extra + ['-e']
        if source:
            cmdline = cmdline + ['-s', source]
        #end
        self.Main(cmdline + [target, output])
    #end

    def Decode(self, input, source, output):
        """Decode `input` into `output` (self.extra is not passed)."""
        cmdline = ALL_ARGS + ['-d']
        if source:
            cmdline = cmdline + ['-s', source]
        #end
        self.Main(cmdline + [input, output])
    #end

    def Verify(self, target, recon):
        """Compare `recon` with `target` using cmp; when the target name
        ends in ".gz" both are gunzipped first and the results compared."""
        if not target.endswith(".gz"):
            RunCommand(('cmp', target, recon))
            return
        #end
        RunCommandIO(('gzip', '-dc'), target, CMPTMP1)
        RunCommandIO(('gzip', '-dc'), recon, CMPTMP2)
        RunCommand(('cmp', CMPTMP1, CMPTMP2))
    #end

    def EncodeSize(self, output):
        return os.path.getsize(output)
    #end

    def Main(self, args):
        """Invoke xdelta3 with `args`; any Exception raised on the way is
        reported as a CommandError."""
        try:
            if not self.forkexec:
                xdelta3.xd3_main_cmdline(args)
            else:
                RunCommand(['../xdelta3'] + args)
            #end
        except Exception as e:
            raise CommandError(args, "xdelta3.main exception: %s" % e)
        #end
    #end
#end

class Xdelta3Mod1:
    """TimedTest runnable that drives the in-memory xdelta3 module API on
    the contents of one file; the path arguments of Encode/Decode/Verify/
    EncodeSize are ignored."""

    def __init__(self, file):
        # Read as binary so the data is never altered by text-mode
        # translation, and close the file promptly.
        with open(file, 'rb') as f:
            self.target_data = f.read()
        #end
    #end

    def Encode(self, ignore1, ignore2, ignore3):
        """Encode target_data with no source; result kept in self.encoded.
        Raises CommandError on a non-zero return code."""
        r1, encoded = xdelta3.xd3_encode_memory(self.target_data, None, 1000000, 1<<10)
        if r1 != 0:
            raise CommandError('memory', 'encode failed: %s' % r1)
        #end
        self.encoded = encoded
    #end

    def Decode(self, ignore1, ignore2, ignore3):
        """Decode self.encoded (set by Encode); result kept in
        self.decoded.  Raises CommandError on a non-zero return code."""
        r2, data1 = xdelta3.xd3_decode_memory(self.encoded, None, len(self.target_data))
        if r2 != 0:
            # report the decoder's own return code
            raise CommandError('memory', 'decode failed: %s' % r2)
        #end
        self.decoded = data1
    #end

    def Verify(self, ignore1, ignore2):
        """Raise CommandError unless the decoded data equals the input."""
        if self.target_data != self.decoded:
            raise CommandError('memory', 'bad decode')
        #end
    #end

    def EncodeSize(self, ignore1):
        return len(self.encoded)
    #end
#end

class GzipRun1:
    """TimedTest runnable that compresses with gzip; it cannot use a
    source file, so `source` must be None."""

    def Encode(self, target, source, output):
        assert source is None
        compress = ['gzip', '-cf']
        RunCommandIO(compress, target, output)
    #end

    def Decode(self, input, source, output):
        assert source is None
        expand = ['gzip', '-dcf']
        RunCommandIO(expand, input, output)
    #end

    def Verify(self, target, recon):
        # cmp exits non-zero when the files differ -> CommandError
        RunCommand(('cmp', target, recon))
    #end

    def EncodeSize(self, output):
        return os.path.getsize(output)
    #end
#end

class Xdelta1RunClass:
    """Factory for Xdelta1Runner objects; str() is the label 'xdelta1'."""

    def __str__(self):
        label = 'xdelta1'
        return label
    #end

    def New(self):
        runner = Xdelta1Runner()
        return runner
    #end
#end

class Xdelta1Runner:
    """TimedTest runnable driving the external 'xdelta1' program; a
    source file is mandatory."""

    def Encode(self, target, source, output):
        assert source is not None
        # Note: for dumb historical reasons, xdelta1 returns 1 or 0
        # (both exit values are accepted for 'delta')
        RunCommand(['xdelta1', 'delta', '-q', source, target, output],
                   [0, 1])
    #end

    def Decode(self, input, source, output):
        assert source is not None
        RunCommand(['xdelta1', 'patch', '-q', input, source, output])
    #end

    def Verify(self, target, recon):
        RunCommand(('cmp', target, recon))
    #end

    def EncodeSize(self, output):
        return os.path.getsize(output)
    #end
#end

# exceptions
class SkipRcsException(Exception):
    """Raised when an RCS file cannot be used; `reason` says why."""
    def __init__(self,reason):
        Exception.__init__(self, reason)
        self.reason = reason
    #end
#end

class NotEnoughVersions(Exception):
    """Raised when an RCS file has fewer than two versions to compare."""
    def __init__(self):
        Exception.__init__(self)
    #end
#end

class CommandError(Exception):
    """A command failed.  The failure is printed when the exception is
    constructed; the details are also kept on the instance.

    cmd -- command as a string, or as a tuple/list of arguments (joined
           with spaces; an empty sequence yields '').
    str -- description of the failure (parameter name kept for callers).
    """
    def __init__(self,cmd,str):
        if isinstance(cmd, (tuple, list)):
            cmd = ' '.join(['%s' % x for x in cmd])
        #end
        Exception.__init__(self, str)
        self.cmd = cmd
        self.reason = str
        # The two-space gap matches the historic output of
        # "print 'command was: ',cmd".
        print('command was:  %s' % (cmd,))
        print('command failed:  %s' % (str,))
        print('have fun debugging')
    #end
#end

class RcsVersion:
    """One revision of an RCS file.  `vstr` is the revision string; a
    `date` attribute is attached later (RcsFile.Date) and is what
    versions are ordered by."""

    def __init__(self,vstr):
        self.vstr = vstr
    #end

    def __cmp__(self,other):
        mine, theirs = self.date, other.date
        return cmp(mine, theirs)
    #end

    def __str__(self):
        text = str(self.vstr)
        return text
    #end
#end

class RcsFile:
    """One RCS ",v" file: its revision list (parsed from `rlog`) and
    helpers that check revisions out into TMPDIR for testing."""

    def __init__(self, fname):
        self.fname    = fname
        self.versions = []
        self.state    = HEAD_STATE
        # Stays 0 if rlog never reports a revision count, so later
        # comparisons do not fail with AttributeError.
        self.totrev   = 0
    #end

    def SetTotRev(self,s):
        self.totrev = int(s)
    #end

    def Rev(self,s):
        """Record a new revision `s`; raises SkipRcsException when more
        revisions show up than rlog announced."""
        self.rev = RcsVersion(s)
        if len(self.versions) >= self.totrev:
            raise SkipRcsException('too many versions (in log messages)')
        #end
        self.versions.append(self.rev)
    #end

    def Date(self,s):
        # date of the revision most recently recorded by Rev()
        self.rev.date = s
    #end

    def Match(self, line, state, rx, gp, newstate, f):
        """If the parser is in `state` and `rx` matches `line`: call
        f(group gp) when f is given, switch to `newstate` and return 1.
        Otherwise return None."""
        if state == self.state:
            m = rx.match(line)
            if m:
                if f:
                    f(m.group(gp))
                #end
                self.state = newstate
                return 1
            #end
        #end
        return None
    #end

    def Sum1Rlog(self):
        """Parse the output of `rlog` on this file into self.totrev and
        self.versions.  Raises CommandError when rlog reports failure."""
        # (state, pattern, next state, action); the first entry that
        # matches a line wins.
        transitions = (
            (HEAD_STATE, RE_TOTREV, BAR_STATE,  self.SetTotRev),
            (BAR_STATE,  RE_BAR,    REV_STATE,  None),
            (REV_STATE,  RE_REV,    DATE_STATE, self.Rev),
            (DATE_STATE, RE_DATE,   BAR_STATE,  self.Date),
            )
        f = os.popen('rlog '+self.fname, "r")
        try:
            for l in f:
                for (state, rx, newstate, action) in transitions:
                    if self.Match(l, state, rx, 1, newstate, action):
                        break
                    #end
                #end
            #end
        finally:
            # close() returns None on success, else the wait status
            c = f.close()
        #end
        if c is not None:
            raise CommandError('rlog ' + self.fname,
                               'exited with wait status %s' % c)
        #end
    #end

    def Sum1(self):
        """Record the file size, parse its revisions and sort them by
        date.  Raises SkipRcsException on a revision-count mismatch."""
        st = os.stat(self.fname)
        self.rcssize = st.st_size
        self.Sum1Rlog()
        if self.totrev != len(self.versions):
            raise SkipRcsException('wrong version count')
        #end
        self.versions.sort()
    #end

    def Checkout(self,n):
        """Check out version number `n` into self.Verf(n) with `co` and
        store the checked-out size in the version's `vsize`.  Raises
        CommandError when co cannot be run or exits non-zero."""
        # local import: subprocess is only needed here
        import subprocess
        v   = self.versions[n]
        cmd = ['co', '-ko', '-p%s' % v.vstr, self.fname]
        out = open(self.Verf(n), "wb")
        try:
            try:
                # stdout goes straight to the file; communicate() closes
                # stdin and drains stderr, so co cannot block on a full
                # pipe.
                proc = subprocess.Popen(cmd,
                                        stdin=subprocess.PIPE,
                                        stdout=out,
                                        stderr=subprocess.PIPE)
            except OSError as e:
                raise CommandError(cmd, 'cannot run co: %s' % e)
            #end
            estr = proc.communicate()[1]
        finally:
            out.close()
        #end
        v.vsize = os.path.getsize(self.Verf(n))
        # co also writes progress text to stderr on success, so only the
        # exit status decides whether the checkout failed.
        if proc.returncode != 0:
            raise CommandError(cmd, 'checkout failed: %s\n%s\n%s' % (v.vstr, self.fname, estr))
        #end
    #end

    def Vdate(self,n):
        return self.versions[n].date
    #end

    def Vstr(self,n):
        return self.versions[n].vstr
    #end

    def Verf(self,n):
        # scratch path of checked-out version number n
        return os.path.join(TMPDIR, 'input.%d' % n)
    #end

    def FilePairsByDate(self, runclass):
        """Run a TimedTest (runner from runclass.New()) for every pair of
        date-adjacent versions, older version as source.  Returns the
        list of TimedTest results; raises NotEnoughVersions when there
        are fewer than two versions."""
        if self.totrev < 2:
            raise NotEnoughVersions()
        #end
        self.Checkout(0)
        ntrials = []
        for v in range(0,self.totrev-1):
            if v >= 1:
                # version v-1 is no longer needed once v and v+1 are the
                # pair under test (this also cleans up input.0)
                os.remove(self.Verf(v-1))
            #end
            self.Checkout(v+1)
            if os.stat(self.Verf(v)).st_size < MIN_SIZE or \
               os.stat(self.Verf(v+1)).st_size < MIN_SIZE:
                continue
            #end

            result = TimedTest(self.Verf(v+1),
                               self.Verf(v),
                               runclass.New())

            ntrials.append(result)
        #end

        # the last pair is still on disk
        os.remove(self.Verf(self.totrev-1))
        os.remove(self.Verf(self.totrev-2))
        return ntrials
    #end

    def AppendVersion(self, f, n):
        """Check out version `n`, append its contents to the open file
        `f` and return the number of bytes appended."""
        self.Checkout(n)
        with open(self.Verf(n), "rb") as rf:
            data = rf.read()
        #end
        f.write(data)
        return len(data)
    #end

class RcsFinder:
    """Collects the RCS files below a directory and builds test inputs
    from them."""

    def __init__(self):
        self.subdirs  = []
        self.rcsfiles = []
        self.others   = []
        self.skipped  = []
        self.biground = 0
    #end

    def Scan1(self,dir):
        """Classify the entries of one directory into sub-directories,
        RCS files (names ending in ",v") and everything else; add them
        to the running totals and return the sub-directories found."""
        found_dirs  = []
        found_rcs   = []
        found_other = []
        for name in os.listdir(dir):
            path = os.path.join(dir, name)
            if os.path.isdir(path):
                found_dirs.append(path)
            elif name.endswith(",v"):
                found_rcs.append(RcsFile(path))
            else:
                found_other.append(path)
            #end
        #end
        self.subdirs  = self.subdirs  + found_dirs
        self.rcsfiles = self.rcsfiles + found_rcs
        self.others   = self.others   + found_other
        return found_dirs
    #end

    def Crawl(self, dir):
        """Scan `dir` and all directories below it, breadth first."""
        pending = [dir]
        while pending:
            current = pending.pop(0)
            pending.extend(self.Scan1(current))
        #end
    #end

    def Summarize(self):
        """Parse every RCS file found; files that raise SkipRcsException
        or have fewer than two versions move to self.skipped."""
        kept = []
        for rf in self.rcsfiles:
            try:
                rf.Sum1()
                if rf.totrev < 2:
                    raise SkipRcsException('too few versions (< 2)')
                #end
            except SkipRcsException:
                #print 'skipping file %s: %s' % (rf.fname, e.reason)
                self.skipped.append(rf)
                continue
            #end
            kept.append(rf)
        #end
        self.rcsfiles = kept
    #end

    def AllPairsByDate(self, runclass):
        """Run FilePairsByDate on every RCS file, print the totals and
        return all results.  Files that could not be tested are dropped
        from self.rcsfiles."""
        results = []
        usable = []
        for rf in self.rcsfiles:
            try:
                results = results + rf.FilePairsByDate(runclass)
            except SkipRcsException:
                print('file %s has compressed versions: skipping' % (rf.fname))
                continue
            except NotEnoughVersions:
                print('testing %s on %s: not enough versions' % (runclass, rf.fname))
                continue
            #end
            usable.append(rf)
        #end
        self.rcsfiles = usable
        self.ReportPairs(runclass, results)
        return results
    #end

    def ReportPairs(self, name, results):
        """Print the summed mean encode/decode times and encoded sizes."""
        encode_time = sum(r.encode_time.mean for r in results)
        decode_time = sum(r.decode_time.mean for r in results)
        encode_size = sum(r.encode_size for r in results)
        print('%s rcs: encode %.2f s: decode %.2f s: size %d' %
              (name, encode_time, decode_time, encode_size))
    #end

    def MakeBigFiles(self, rand):
        """Build TMPDIR/big.1 and TMPDIR/big.2 by concatenating, for a
        random FILE_P fraction of the files with at least two versions,
        two distinct randomly chosen versions (one into each file).
        Returns (source path, target path, test key)."""
        big1 = TMPDIR + "/big.1"
        big2 = TMPDIR + "/big.2"
        f1 = open(big1, "w")
        f2 = open(big2, "w")
        population = [rf for rf in self.rcsfiles if len(rf.versions) >= 2]
        f1sz = 0
        f2sz = 0
        fcount = int(len(population) * FILE_P)
        assert fcount > 0
        for chosen in rand.sample(population, fcount):
            if IGNORE_FILENAME.match(chosen.fname) != None:
                continue
            #end
            r1, r2 = rand.sample(xrange(0, len(chosen.versions)), 2)
            f1sz += chosen.AppendVersion(f1, r1)
            f2sz += chosen.AppendVersion(f2, r2)
            #m.update('%s,%s,%s ' % (file.fname[len(RCSDIR):],
            #file.Vstr(r1), file.Vstr(r2)))
        #end
        testkey = 'rcs%d' % self.biground
        self.biground = self.biground + 1

        print('%s; source %u bytes; target %u bytes' % (testkey, f1sz, f2sz))
        f1.close()
        f2.close()
        return (big1, big2, testkey)
    #end

    def Generator(self):
        """Return a function of `rand` producing a fresh big-file pair."""
        def generate(rand):
            return self.MakeBigFiles(rand)
        return generate
    #end
#end

# find a set of RCS files for testing
def GetTestRcsFiles():
    """Crawl RCSDIR, parse the RCS files found, print a summary and
    return the RcsFinder.  Raises CommandError when no RCS file exists."""
    rcsf = RcsFinder()
    rcsf.Crawl(RCSDIR)
    if len(rcsf.rcsfiles) == 0:
        raise CommandError('', 'no RCS files')
    #end
    rcsf.Summarize()
    counts = (len(rcsf.rcsfiles),
              len(rcsf.subdirs),
              len(rcsf.others),
              len(rcsf.skipped))
    print("rcsfiles: rcsfiles %d; subdirs %d; others %d; skipped %d" % counts)
    sizes = [x.rcssize for x in rcsf.rcsfiles]
    print(StatList(sizes, "rcssize").str)
    revisions = [x.totrev for x in rcsf.rcsfiles]
    print(StatList(revisions, "totrev").str)
    return rcsf
#end

class SampleDataTest:
    """Builds (source, target, key) test inputs from directories of
    sample files."""

    def __init__(self, dirs):
        """Walk `dirs` and everything below them.  Each directory holding
        more than one file contributes every ordered pair of its files
        (including a file paired with itself) to self.pairs, with key
        'source-target'."""
        dirs_in = dirs
        self.pairs = []
        pending = dirs[:]
        while pending:
            d = pending.pop(0)
            files = []
            for entry in os.listdir(d):
                path = os.path.join(d, entry)
                if os.path.isdir(path):
                    # visited later, after the directories already queued
                    pending.append(path)
                else:
                    files.append(path)
                #end
            #end
            if len(files) <= 1:
                continue
            #end
            files.sort()
            for first in files:
                for second in files:
                    key = '%s-%s' % (first, second)
                    self.pairs.append((first, second, key))
                #end
            #end
        #end
        print("Sample data test using %d file pairs in %s" % (
            len(self.pairs), dirs_in))
    #end

    def Generator(self):
        """Return a function of `rand` picking one pair at random."""
        def generate(rand):
            return rand.choice(self.pairs)
        return generate
    #end
#end

# configs are represented as a list of values,
# program takes a list of strings:
def ConfigToArgs(config):
    """Translate a configuration (values in CONFIG_ORDER order) into
    xdelta3 command-line arguments.

    The first SOFT_CONFIG_CNT values become '-C v1,v2,...'.  Each later
    value maps through CONFIG_ARGMAP: 'true' emits the bare flag, 'false'
    emits nothing, anything else emits 'flag=value'.
    """
    soft = [str(x) for x in config[0:SOFT_CONFIG_CNT]]
    args = ['-C', ','.join(soft)]
    for i in range(SOFT_CONFIG_CNT, len(CONFIG_ORDER)):
        flag = CONFIG_ARGMAP[CONFIG_ORDER[i]]
        val = config[i]
        if val == 'true':
            args.append('%s' % flag)
        elif val != 'false':
            args.append('%s=%s' % (flag, val))
        #end
    #end
    return args
#end

#
class RandomTest:
    """Result of testing one configuration on one input.

    tnum     -- index of the test within its round
    tinput   -- (source path, target path, key) triple; only the key is
                kept (see tinput())
    config   -- configuration tuple (see CONFIG_ORDER)
    syntuple -- optional (time, size): record these synthesized values
                instead of running a TimedTest

    score, time_pos, size_pos and score_pos start as None and are plain
    attributes filled in by ScoreTests.
    """
    def __init__(self, tnum, tinput, config, syntuple = None):
        self.mytinput = tinput[2]
        self.myconfig = config
        self.tnum = tnum

        if syntuple is not None:
            self.runtime = syntuple[0]
            self.compsize = syntuple[1]
            self.decodetime = None
        else:
            args = ConfigToArgs(config)
            result = TimedTest(tinput[1], tinput[0], Xdelta3Runner(args))

            self.runtime = result.encode_time.mean
            self.compsize = result.encode_size
            self.decodetime = result.decode_time.mean
        #end

        # NOTE: `score` is a data attribute.  The class used to define a
        # score() method as well, but this assignment always hid it (a
        # call would have raised TypeError), so the method was removed.
        self.score = None
        self.time_pos = None
        self.size_pos = None
        self.score_pos = None
    #end

    def __str__(self):
        # ranks are shown only once ScoreTests has assigned them
        time_rank = ''
        if self.time_pos is not None:
            time_rank = " (%s)" % self.time_pos
        #end
        size_rank = ''
        if self.size_pos is not None:
            size_rank = " (%s)" % self.size_pos
        #end
        decodestr = ' %s' % self.decodetime
        return 'time %.6f%s size %d%s << %s >>%s' % (
            self.time(), time_rank,
            self.size(), size_rank,
            c2str(self.config()),
            decodestr)
    #end

    def time(self):
        return self.runtime
    #end

    def size(self):
        return self.compsize
    #end

    def config(self):
        return self.myconfig
    #end

    def tinput(self):
        return self.mytinput
    #end
#end

def PosInAlist(l, e):
    """Return the index of the first pair in `l` whose second item
    equals `e`, or -1 when there is none."""
    for index, pair in enumerate(l):
        if pair[1] == e:
            return index
        #end
    #end
    return -1
#end

# Generates a set of num_results test configurations, given the list of
# retest-configs.
def RandomTestConfigs(rand, input_configs, num_results):
    """Return a sorted list of distinct configuration tuples.

    The result starts from `input_configs` (configurations to re-test)
    and is topped up with new random configurations until it holds
    `num_results` entries.  The search stops early, with a message, when
    100 consecutive random draws produce nothing new.
    """
    outputs = input_configs[:]
    have_set = set(input_configs)

    # The table of generators is the same for every draw: build it once
    # instead of once per key of every candidate.
    spec = INPUT_SPEC(rand)

    # Compute a random configuration
    def RandomConfig():
        config = []
        cmap = {}
        for key in CONFIG_ORDER:
            # each generator sees the values chosen so far
            val = cmap[key] = (spec[key])(cmap)
            config.append(val)
        #end
        return tuple(config)
    #end

    while len(outputs) < num_results:
        newc = None
        for i in range(100):
            c = RandomConfig()
            if c in have_set:
                continue
            #end
            have_set.add(c)
            newc = c
            break
        #end
        if newc is None:
            print('stopped looking for configs at %d' % len(outputs))
            break
        #end
        outputs.append(newc)
    #end
    outputs.sort()
    return outputs
#end

def RunOptimizationLoop(rand, generator, rounds):
    """Run `rounds` rounds of random configuration testing.

    Each round tests up to MAX_RESULTS configurations on one input from
    generator(rand), scores them, files every result under its
    configuration in test_all_config_results, and carries the best
    KEEP_P fraction into the next round.
    """
    carried = []
    for rnum in range(rounds):
        configs = RandomTestConfigs(rand, carried, MAX_RESULTS)
        tinput = generator(rand)
        tests = []
        for x, config in enumerate(configs):
            t = RandomTest(x, tinput, config)
            print('Round %d test %d: %s' % (rnum, x, t))
            tests.append(t)
        #end
        results = ScoreTests(tests)

        for r in results:
            test_all_config_results.setdefault(r.config(), []).append(r)
        #end

        #GraphResults('expt%d' % rnum, results)
        #GraphSummary('sum%d' % rnum, results)

        # re-test some fraction
        keep = int(MAX_RESULTS * KEEP_P)
        carried = [r.config() for r in results[0:keep]]
    #end
#end

# TODO: cleanup
# Global map from configuration tuple to the list of RandomTest results
# recorded for it; filled by RunOptimizationLoop, read by GraphSummary.
test_all_config_results = {}

def ScoreTests(results):
    """Score and rank test results; return them best (lowest) score first.

    Every test gets:
      score     -- (size - 0.999*min size) * (time - 0.999*min time)
      score_pos -- 1-based rank by score
      size_pos  -- 0-based rank by size
      time_pos  -- 0-based rank by time
    One 'H-Score' line is printed per test.  Tests with equal keys keep
    their input order.  `results` must not be empty.
    """
    t_min = float(min([test.time() for test in results]))
    #t_max = float(max([test.time() for test in results]))
    s_min = float(min([test.size() for test in results]))
    #s_max = float(max([test.size() for test in results]))

    # Hyperbolic function. Smaller scores still better
    red = 0.999  # minimum factors for each dimension are 1/1000
    for test in results:
        test.score = ((test.size() - s_min * red) *
                      (test.time() - t_min * red))
    #end

    # Sort on the key alone so that ties never compare test objects.
    scored = sorted(results, key=lambda test: test.score)
    timed  = sorted(results, key=lambda test: test.time())
    sized  = sorted(results, key=lambda test: test.size())

    for pos in range(len(scored)):
        scored[pos].score_pos = pos + 1
    #end

    # Walk backwards so that a test listed more than once keeps its
    # earliest position, as a first-match search would report.
    for pos in range(len(sized) - 1, -1, -1):
        sized[pos].size_pos = pos
    #end
    for pos in range(len(timed) - 1, -1, -1):
        timed[pos].time_pos = pos
    #end

    for test in scored:
        print('H-Score: %0.9f %s' % (test.score, test))
    #end

    return scored
#end

def GraphResults(desc, results):
    """Write one 'time<TAB>size<TAB># description' line per result to
    data-<desc>.csv in the current directory, then run ./plot.sh to
    render plot-<desc>.jpg from it."""
    # `with` closes the file even when formatting a result fails
    with open("data-%s.csv" % desc, "w") as f:
        for r in results:
            f.write("%0.9f\t%d\t# %s\n" % (r.time(), r.size(), r))
        #end
    #end
    # NOTE(review): desc is interpolated into a shell command line; only
    # pass trusted, shell-safe names.
    os.system("./plot.sh data-%s.csv plot-%s.jpg" % (desc, desc))
#end

def GraphSummary(desc, results_ignore):
    """Combine the results in test_all_config_results across inputs.

    Configurations are visited from most-tested to least-tested.  For
    each group of configurations that share the same set of tested
    inputs, the per-configuration time and size are summed over those
    inputs and scored with ScoreTests as synthetic RandomTest results.
    `results_ignore` is unused.
    """
    test_population = 0
    config_ordered = []

    # drops duplicate test/config pairs (TODO: don't retest them)
    for config, cresults in test_all_config_results.items():
        input_config_map = {}
        uniq = []
        for test in cresults:
            assert test.config() == config
            test_population += 1
            key = test.tinput()
            if key not in input_config_map:
                input_config_map[key] = {}
            #end
            if config in input_config_map[key]:
                print('skipping repeat test %s vs. %s' % (input_config_map[key][config], test))
                continue
            #end
            input_config_map[key][config] = test
            uniq.append(test)
        #end
        config_ordered.append(uniq)
    #end

    if not config_ordered:
        # nothing has been recorded yet
        print('population %d: %d configs %d results' % (test_population, 0, 0))
        return
    #end

    # sort configs descending by number of tests (stable for ties)
    config_ordered.sort(key=len, reverse=True)

    print('population %d: %d configs %d results' %
          (test_population,
           len(config_ordered),
           len(config_ordered[0])))

    # With a single input per configuration there is nothing to combine.
    if len(config_ordered[0]) == 1:
        return
    #end

    # a map from test-key to test-list w/ various configs
    input_set = {}
    osize = len(config_ordered)

    for i in range(len(config_ordered)):
        config = config_ordered[i][0].config()
        config_tests = config_ordered[i]

        #print '%s has %d tested inputs' % (config, len(config_tests))

        if len(input_set) == 0:
            input_set = dict([(t.tinput(), [t]) for t in config_tests])
            continue
        #end

        # a map from test-key to test-list w/ various configs
        update_set = {}
        for r in config_tests:
            t = r.tinput()
            if t in input_set:
                update_set[t] = input_set[t] + [r]
            else:
                #print 'config %s does not have test %s' % (config, t)
                pass
            #end
        #end

        if len(update_set) <= 1:
            break
        #end

        input_set = update_set

        # continue if there are more w/ the same number of inputs
        if i < (len(config_ordered) - 1) and \
           len(config_ordered[i + 1]) == len(config_tests):
            continue
        #end

        # synthesize results for multi-test inputs
        config_num = None

        # map of config to sum(various test-keys)
        smap = {}
        for (key, tests) in input_set.items():
            if config_num is None:
                # config_num should be the same in all elements
                config_num = len(tests)
                smap = dict([(r.config(),
                              (r.time(),
                               r.size()))
                             for r in tests])
            else:
                # compute the per-config sum of time/size
                assert config_num == len(tests)
                smap = dict([(r.config(),
                              (smap[r.config()][0] + r.time(),
                               smap[r.config()][1] + r.size()))
                             for r in tests])
            #end
        #end

        if config_num == 1:
            continue
        #end

        if len(input_set) == osize:
            break
        #end

        summary = '%s-%d' % (desc, len(input_set))
        osize = len(input_set)

        print('generate %s w/ %d configs' % (summary, config_num))
        syn = [RandomTest(0, (None, None, summary), syn_config,
                          syntuple = (smap[syn_config][0],
                                      smap[syn_config][1]))
               for syn_config in smap.keys()]
        syn = ScoreTests(syn)
        #print 'smap is %s' % (smap,)
        #print 'syn is %s' % (' and '.join([str(x) for x in syn]))
        #GraphResults(summary, syn)
    #end
#end

def RunRegressionTest(pairs, rounds):
    """Encode, decode and verify every (file1, file2, key) triple in
    `pairs` once for each fixed argument set, through the xdelta3
    executable (forkexec), printing times and sizes.  `rounds` is
    currently unused."""
    arg_sets = [
        [],
        ['-S=djw'],
        ['-B=412907520'],
        ['-B 412907520', ],
        ]
    report = "Source %s\nTarget %s\nEncode %s\nDecode %s\nSize %s\n\n"
    for args in arg_sets:
        print("Args %s" % (args,))
        for (file1, file2, testkey) in pairs:
            runner = Xdelta3Runner(args, forkexec=True)
            # a single un-warmed trial per pair
            ttest = TimedTest(file1, file2, runner,
                              skip_trials = 0,
                              min_trials = 1,
                              max_trials = 1)
            print(report % (
                file1, file2,
                ttest.encode_time.str,
                ttest.decode_time.str,
                ttest.encode_size))
        #end
    #end
#end

# Script entry point: recreate the scratch directory, build the sample
# file pairs from SAMPLEDIR and run the regression test over them.
if __name__ == "__main__":
    try:
        # start from an empty scratch directory
        RunCommand(['rm', '-rf', TMPDIR])
        os.mkdir(TMPDIR)

        #rcsf = GetTestRcsFiles()
        #generator = rcsf.Generator()

        sample = SampleDataTest([SAMPLEDIR])
        generator = sample.Generator()

        # fixed seed; `rand` and `generator` are only needed by the
        # optimization loop, which is not invoked here
        rand = random.Random(135135135135135)

        RunRegressionTest(sample.pairs, TEST_ROUNDS)

        #RunSpeedTest()

        # the idea below is to add the default configurations and
        # xdelta1 to the optimization loop:
        #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-1', '-3', '-6']))
        #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9']))
        #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9', '-S', 'djw']))
        #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-1', '-S', 'djw']))
        #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9', '-T']))
        #x1r = rcsf.AllPairsByDate(Xdelta1RunClass())

    except CommandError:
        # CommandError prints its details when constructed; TMPDIR is
        # left in place on this path (it is only removed on success)
        pass
    else:
        # success: remove the scratch directory
        RunCommand(['rm', '-rf', TMPDIR])
        pass
    #end
#end