1#!/usr/bin/python2.6
2# xdelta 3 - delta compression tools and library
3# Copyright (C) 2003, 2006, 2007, 2008.  Joshua P. MacDonald
4#
5#  This program is free software; you can redistribute it and/or modify
6#  it under the terms of the GNU General Public License as published by
7#  the Free Software Foundation; either version 2 of the License, or
8#  (at your option) any later version.
9#
10#  This program is distributed in the hope that it will be useful,
11#  but WITHOUT ANY WARRANTY; without even the implied warranty of
12#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13#  GNU General Public License for more details.
14#
15#  You should have received a copy of the GNU General Public License
16#  along with this program; if not, write to the Free Software
17#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
19# TODO: test 1.5 vs. greedy
20
21import os, sys, math, re, time, types, array, random
22import xdelta3
23
24#RCSDIR = '/mnt/polaroid/Polaroid/orbit_linux/home/jmacd/PRCS'
25#RCSDIR = '/tmp/PRCS_read_copy'
26#SAMPLEDIR = "/tmp/WESNOTH_tmp/diff"
27
28#RCSDIR = 'G:/jmacd/PRCS_copy'
29#SAMPLEDIR = "C:/sample_data/Wesnoth/tar"
30
31RCSDIR = '/Users/jmacd/src/ftp.kernel.org'
32SAMPLEDIR = '/Users/jmacd/src/xdelta3/linux'
33
34#
35MIN_SIZE       = 0
36
37TIME_TOO_SHORT = 0.050
38
39SKIP_TRIALS    = 2
40MIN_TRIALS     = 3
41MAX_TRIALS     = 15
42
43# 10 = fast 1.5 = slow
44MIN_STDDEV_PCT = 1.5
45
46# How many results per round
47MAX_RESULTS = 500
48TEST_ROUNDS = 10
49KEEP_P = (0.5)
50
51# For RCS testing, what percent to select
52FILE_P = (0.50)
53
54# For run-speed tests
55MIN_RUN = 1000 * 1000 * 1
56MAX_RUN = 1000 * 1000 * 10
57
58# Testwide defaults
59ALL_ARGS = [
60    '-q'  # '-vv'
61    ]
62
63# The first 7 args go to -C
64SOFT_CONFIG_CNT = 7
65
66CONFIG_ORDER = [ 'large_look',
67                 'large_step',
68                 'small_look',
69                 'small_chain',
70                 'small_lchain',
71                 'max_lazy',
72                 'long_enough',
73
74                 # > SOFT_CONFIG_CNT
75                 'nocompress',
76                 'winsize',
77                 'srcwinsize',
78                 'sprevsz',
79                 'iopt',
80                 'djw',
81                 'altcode',
82                 ]
83
84CONFIG_ARGMAP = {
85    'winsize'    : '-W',
86    'srcwinsize' : '-B',
87    'sprevsz'    : '-P',
88    'iopt'       : '-I',
89    'nocompress' : '-N',
90    'djw'        : '-Sdjw',
91    'altcode'    : '-T',
92    }
93
94def INPUT_SPEC(rand):
95    return {
96
97    # Time/space costs:
98
99    # -C 1,2,3,4,5,6,7
100    'large_look' : lambda d: rand.choice([9, 10, 11, 12]),
101    'large_step' : lambda d: rand.choice([25, 26, 27, 28, 29, 30]),
102    'small_look'   : lambda d: rand.choice([4]),
103    'small_chain'  : lambda d: rand.choice([1]),
104    'small_lchain' : lambda d: rand.choice([1]),
105    'max_lazy'     : lambda d: rand.choice([4, 5, 6, 7, 8, 9, 10 ]),
106
107    # Note: long_enough only refers to small matching and has no effect if
108    # small_chain == 1.
109    'long_enough'  : lambda d: rand.choice([4]),
110
111    # -N
112    'nocompress'   : lambda d: rand.choice(['false']),
113
114    # -T
115    'altcode'      : lambda d: rand.choice(['false']),
116
117    # -S djw
118    'djw'          : lambda d: rand.choice(['false']),
119
120    # Memory costs:
121
122    # -W
123    'winsize'      : lambda d: 8 * (1<<20),
124
125    # -B
126    'srcwinsize'   : lambda d: 64 * (1<<20),
127
128    # -I 0 is unlimited
129    'iopt'         : lambda d: 0,
130
131    # -P only powers of two
132    'sprevsz'      : lambda d: rand.choice([x * (1<<16) for x in [4]]),
133  }
134#end
135
136#
137TMPDIR = '/tmp/xd3regtest.%d' % os.getpid()
138
139RUNFILE = os.path.join(TMPDIR, 'run')
140DFILE   = os.path.join(TMPDIR, 'output')
141RFILE   = os.path.join(TMPDIR, 'recon')
142CMPTMP1 = os.path.join(TMPDIR, 'cmptmp1')
143CMPTMP2 = os.path.join(TMPDIR, 'cmptmp2')
144
145HEAD_STATE = 0
146BAR_STATE  = 1
147REV_STATE  = 2
148DATE_STATE = 3
149
150#
151IGNORE_FILENAME  = re.compile('.*\\.(gif|jpg).*')
152
153# rcs output
154RE_TOTREV  = re.compile('total revisions: (\\d+)')
155RE_BAR     = re.compile('----------------------------')
156RE_REV     = re.compile('revision (.+)')
157RE_DATE    = re.compile('date: ([^;]+);.*')
158# xdelta output
159RE_HDRSZ   = re.compile('VCDIFF header size: +(\\d+)')
160RE_EXTCOMP = re.compile('XDELTA ext comp.*')
161
162def c2str(c):
163    return ' '.join(['%s' % x for x in c])
164#end
165
166def SumList(l):
167    return reduce(lambda x,y: x+y, l)
168#end
169
170# returns (total, mean, stddev, q2 (median),
171#          (q3-q1)/2 ("semi-interquartile range"), max-min (spread))
172class StatList:
173    def __init__(self,l,desc):
174        cnt = len(l)
175        assert(cnt > 1)
176        l.sort()
177        self.cnt    = cnt
178        self.l      = l
179        self.total  = SumList(l)
180        self.mean   = self.total / float(self.cnt)
181        self.s      = math.sqrt(SumList([(x-self.mean) *
182                                         (x - self.mean) for x in l]) /
183                                float(self.cnt-1))
184        self.q0     = l[0]
185        self.q1     = l[int(self.cnt/4.0+0.5)]
186        self.q2     = l[int(self.cnt/2.0+0.5)]
187        self.q3     = l[min(self.cnt-1,int((3.0*self.cnt)/4.0+0.5))]
188        self.q4     = l[self.cnt-1]
189        self.siqr   = (self.q3-self.q1)/2.0;
190        self.spread = (self.q4-self.q0)
191        if len(l) == 1:
192            self.str = '%s %s' % (desc, l[0])
193        else:
194            self.str = '%s mean %.1f: 25%-ile %d %d %d %d %d' % \
195                (desc, self.mean, self.q0, self.q1, self.q2, self.q3, self.q4)
196    #end
197#end
198
199def RunCommand(args, ok = [0]):
200    #print 'run command %s' % (' '.join(args))
201    p = os.spawnvp(os.P_WAIT, args[0], args)
202    if p not in ok:
203        raise CommandError(args, 'exited %d' % p)
204    #end
205#end
206
207def RunCommandIO(args,infn,outfn):
208    p = os.fork()
209    if p == 0:
210        os.dup2(os.open(infn,os.O_RDONLY),0)
211        os.dup2(os.open(outfn,os.O_CREAT|os.O_TRUNC|os.O_WRONLY),1)
212        os.execvp(args[0], args)
213    else:
214        s = os.waitpid(p,0)
215        o = os.WEXITSTATUS(s[1])
216        if not os.WIFEXITED(s[1]) or o != 0:
217            raise CommandError(args, 'exited %d' % o)
218        #end
219    #end
220#end
221
222class TimedTest:
223    def __init__(self, target, source, runnable,
224                 skip_trials = SKIP_TRIALS,
225                 min_trials = MIN_TRIALS,
226                 max_trials = MAX_TRIALS,
227                 min_stddev_pct = MIN_STDDEV_PCT):
228        self.target = target
229        self.source = source
230        self.runnable = runnable
231
232        self.skip_trials = skip_trials
233        self.min_trials = min(min_trials, max_trials)
234        self.max_trials = max_trials
235        self.min_stddev_pct = min_stddev_pct
236
237        self.encode_time = self.DoTest(DFILE,
238                                       lambda x: x.Encode(self.target,
239                                                          self.source, DFILE))
240        self.encode_size = runnable.EncodeSize(DFILE)
241
242        self.decode_time = self.DoTest(RFILE,
243                                       lambda x: x.Decode(DFILE,
244                                                          self.source, RFILE),
245                                       )
246        runnable.Verify(self.target, RFILE)
247    #end
248
249    def DoTest(self, fname, func):
250        trials   = 0
251        measured = []
252
253        while 1:
254            try:
255                os.remove(fname)
256            except OSError:
257                pass
258
259            start_time  = time.time()
260            start_clock = time.clock()
261
262            func(self.runnable)
263
264            total_clock = (time.clock() - start_clock)
265            total_time  = (time.time() - start_time)
266
267            elap_time  = max(total_time,  0.0000001)
268            elap_clock = max(total_clock, 0.0000001)
269
270            trials = trials + 1
271
272            # skip some of the first trials
273            if trials > self.skip_trials:
274                measured.append((elap_clock, elap_time))
275                #print 'measurement total: %.1f ms' % (total_time * 1000.0)
276
277            # at least so many
278            if trials < (self.skip_trials + self.min_trials):
279                #print 'continue: need more trials: %d' % trials
280                continue
281
282            # compute %variance
283            done = 0
284            if self.skip_trials + self.min_trials <= 2:
285                measured = measured + measured;
286                done = 1
287            #end
288
289            time_stat = StatList([x[1] for x in measured], 'elap time')
290            sp = float(time_stat.s) / float(time_stat.mean)
291
292            # what if MAX_TRIALS is exceeded?
293            too_many = (trials - self.skip_trials) >= self.max_trials
294            good = (100.0 * sp) < self.min_stddev_pct
295            if done or too_many or good:
296                trials = trials - self.skip_trials
297                if not done and not good:
298                    #print 'too many trials: %d' % trials
299                    pass
300                #clock = StatList([x[0] for x in measured], 'elap clock')
301                return time_stat
302            #end
303        #end
304    #end
305#end
306
307def Decimals(start, end):
308    l = []
309    step = start
310    while 1:
311        r = range(step, step * 10, step)
312        l = l + r
313        if step * 10 >= end:
314            l.append(step * 10)
315            break
316        step = step * 10
317    return l
318#end
319
320# This tests the raw speed of 0-byte inputs
321def RunSpeedTest():
322    for L in Decimals(MIN_RUN, MAX_RUN):
323        SetFileSize(RUNFILE, L)
324
325        trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<20)]))
326        ReportSpeed(L, trx, '1MB ')
327
328        trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<19)]))
329        ReportSpeed(L, trx, '512k')
330
331        trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<18)]))
332        ReportSpeed(L, trx, '256k')
333
334        trm = TimedTest(RUNFILE, None, Xdelta3Mod1(RUNFILE))
335        ReportSpeed(L, trm, 'swig')
336
337        trg = TimedTest(RUNFILE, None, GzipRun1())
338        ReportSpeed(L,trg,'gzip')
339    #end
340#end
341
342def SetFileSize(F,L):
343    fd = os.open(F, os.O_CREAT | os.O_WRONLY)
344    os.ftruncate(fd,L)
345    assert os.fstat(fd).st_size == L
346    os.close(fd)
347#end
348
349def ReportSpeed(L,tr,desc):
350    print '%s run length %u: size %u: time %.3f ms: decode %.3f ms' % \
351          (desc, L,
352           tr.encode_size,
353           tr.encode_time.mean * 1000.0,
354           tr.decode_time.mean * 1000.0)
355#end
356
357class Xdelta3RunClass:
358    def __init__(self, extra):
359        self.extra = extra
360    #end
361
362    def __str__(self):
363        return ' '.join(self.extra)
364    #end
365
366    def New(self):
367        return Xdelta3Runner(self.extra)
368    #end
369#end
370
371class Xdelta3Runner:
372    # Use "forkexec" to get special command-line only features like
373    # external compression support.
374    def __init__(self, extra, forkexec=False):
375        self.forkexec = forkexec
376        self.extra = extra
377    #end
378
379    def Encode(self, target, source, output):
380        args = (ALL_ARGS +
381                self.extra +
382                ['-e'])
383        if source:
384            args.append('-s')
385            args.append(source)
386        #end
387        args = args + [target, output]
388        self.Main(args)
389    #end
390
391    def Decode(self, input, source, output):
392        args = (ALL_ARGS +
393                ['-d'])
394        if source:
395            args.append('-s')
396            args.append(source)
397        #end
398        args = args + [input, output]
399        self.Main(args)
400    #end
401
402    def Verify(self, target, recon):
403        if target[-3:] == ".gz":
404            RunCommandIO(('gzip', '-dc'), target, CMPTMP1)
405            RunCommandIO(('gzip', '-dc'), recon, CMPTMP2)
406            RunCommand(('cmp', CMPTMP1, CMPTMP2))
407        else:
408            RunCommand(('cmp', target, recon))
409    #end
410
411    def EncodeSize(self, output):
412        return os.stat(output).st_size
413    #end
414
415    def Main(self, args):
416        try:
417            if self.forkexec:
418                RunCommand(['../xdelta3'] + args)
419            else:
420                xdelta3.xd3_main_cmdline(args)
421        except Exception, e:
422            raise CommandError(args, "xdelta3.main exception: %s" % e)
423        #end
424    #end
425#end
426
427class Xdelta3Mod1:
428    def __init__(self, file):
429        self.target_data = open(file, 'r').read()
430    #end
431
432    def Encode(self, ignore1, ignore2, ignore3):
433        r1, encoded = xdelta3.xd3_encode_memory(self.target_data, None, 1000000, 1<<10)
434        if r1 != 0:
435            raise CommandError('memory', 'encode failed: %s' % r1)
436        #end
437        self.encoded = encoded
438    #end
439
440    def Decode(self, ignore1, ignore2, ignore3):
441        r2, data1 = xdelta3.xd3_decode_memory(self.encoded, None, len(self.target_data))
442        if r2 != 0:
443            raise CommandError('memory', 'decode failed: %s' % r1)
444        #end
445        self.decoded = data1
446    #end
447
448    def Verify(self, ignore1, ignore2):
449        if self.target_data != self.decoded:
450            raise CommandError('memory', 'bad decode')
451        #end
452    #end
453
454    def EncodeSize(self, ignore1):
455        return len(self.encoded)
456    #end
457#end
458
459class GzipRun1:
460    def Encode(self, target, source, output):
461        assert source == None
462        RunCommandIO(['gzip', '-cf'], target, output)
463    #end
464
465    def Decode(self, input, source, output):
466        assert source == None
467        RunCommandIO(['gzip', '-dcf'], input, output)
468    #end
469
470    def Verify(self, target, recon):
471        RunCommand(('cmp', target, recon))
472    #end
473
474    def EncodeSize(self, output):
475        return os.stat(output).st_size
476    #end
477#end
478
479class Xdelta1RunClass:
480    def __str__(self):
481        return 'xdelta1'
482    #end
483
484    def New(self):
485        return Xdelta1Runner()
486    #end
487#end
488
489class Xdelta1Runner:
490    def Encode(self, target, source, output):
491        assert source != None
492        args = ['xdelta1', 'delta', '-q', source, target, output]
493        RunCommand(args, [0, 1])
494    #end
495
496    def Decode(self, input, source, output):
497        assert source != None
498        args = ['xdelta1', 'patch', '-q', input, source, output]
499        # Note: for dumb historical reasons, xdelta1 returns 1 or 0
500        RunCommand(args)
501    #end
502
503    def Verify(self, target, recon):
504        RunCommand(('cmp', target, recon))
505    #end
506
507    def EncodeSize(self, output):
508        return os.stat(output).st_size
509    #end
510#end
511
512# exceptions
513class SkipRcsException:
514    def __init__(self,reason):
515        self.reason = reason
516    #end
517#end
518
519class NotEnoughVersions:
520    def __init__(self):
521        pass
522    #end
523#end
524
525class CommandError:
526    def __init__(self,cmd,str):
527        if type(cmd) is types.TupleType or \
528           type(cmd) is types.ListType:
529            cmd = reduce(lambda x,y: '%s %s' % (x,y),cmd)
530        #end
531        print 'command was: ',cmd
532        print 'command failed: ',str
533        print 'have fun debugging'
534    #end
535#end
536
537class RcsVersion:
538    def __init__(self,vstr):
539        self.vstr = vstr
540    #end
541    def __cmp__(self,other):
542        return cmp(self.date, other.date)
543    #end
544    def __str__(self):
545        return str(self.vstr)
546    #end
547#end
548
549class RcsFile:
550
551    def __init__(self, fname):
552        self.fname    = fname
553        self.versions = []
554        self.state    = HEAD_STATE
555    #end
556
557    def SetTotRev(self,s):
558        self.totrev = int(s)
559    #end
560
561    def Rev(self,s):
562        self.rev = RcsVersion(s)
563        if len(self.versions) >= self.totrev:
564            raise SkipRcsException('too many versions (in log messages)')
565        #end
566        self.versions.append(self.rev)
567    #end
568
569    def Date(self,s):
570        self.rev.date = s
571    #end
572
573    def Match(self, line, state, rx, gp, newstate, f):
574        if state == self.state:
575            m = rx.match(line)
576            if m:
577                if f:
578                    f(m.group(gp))
579                #end
580                self.state = newstate
581                return 1
582            #end
583        #end
584        return None
585    #end
586
587    def Sum1Rlog(self):
588        f = os.popen('rlog '+self.fname, "r")
589        l = f.readline()
590        while l:
591            if self.Match(l, HEAD_STATE, RE_TOTREV, 1, BAR_STATE, self.SetTotRev):
592                pass
593            elif self.Match(l, BAR_STATE, RE_BAR, 1, REV_STATE, None):
594                pass
595            elif self.Match(l, REV_STATE, RE_REV, 1, DATE_STATE, self.Rev):
596                pass
597            elif self.Match(l, DATE_STATE, RE_DATE, 1, BAR_STATE, self.Date):
598                pass
599            #end
600            l = f.readline()
601        #end
602        c = f.close()
603        if c != None:
604            raise c
605        #end
606    #end
607
608    def Sum1(self):
609        st = os.stat(self.fname)
610        self.rcssize = st.st_size
611        self.Sum1Rlog()
612        if self.totrev != len(self.versions):
613            raise SkipRcsException('wrong version count')
614        #end
615        self.versions.sort()
616    #end
617
618    def Checkout(self,n):
619        v      = self.versions[n]
620        out    = open(self.Verf(n), "w")
621        cmd    = 'co -ko -p%s %s' % (v.vstr, self.fname)
622        total  = 0
623        (inf,
624         stream,
625         err)  = os.popen3(cmd, "r")
626        inf.close()
627        buf    = stream.read()
628        while buf:
629            total = total + len(buf)
630            out.write(buf)
631            buf = stream.read()
632        #end
633        v.vsize = total
634        estr = ''
635        buf = err.read()
636        while buf:
637            estr = estr + buf
638            buf = err.read()
639        #end
640        if stream.close():
641            raise CommandError(cmd, 'checkout failed: %s\n%s\n%s' % (v.vstr, self.fname, estr))
642        #end
643        out.close()
644        err.close()
645    #end
646
647    def Vdate(self,n):
648        return self.versions[n].date
649    #end
650
651    def Vstr(self,n):
652        return self.versions[n].vstr
653    #end
654
655    def Verf(self,n):
656        return os.path.join(TMPDIR, 'input.%d' % n)
657    #end
658
659    def FilePairsByDate(self, runclass):
660        if self.totrev < 2:
661            raise NotEnoughVersions()
662        #end
663        self.Checkout(0)
664        ntrials = []
665        if self.totrev < 2:
666            return vtrials
667        #end
668        for v in range(0,self.totrev-1):
669            if v > 1:
670                os.remove(self.Verf(v-1))
671            #end
672            self.Checkout(v+1)
673            if os.stat(self.Verf(v)).st_size < MIN_SIZE or \
674               os.stat(self.Verf(v+1)).st_size < MIN_SIZE:
675                continue
676            #end
677
678            result = TimedTest(self.Verf(v+1),
679                               self.Verf(v),
680                               runclass.New())
681
682            target_size = os.stat(self.Verf(v+1)).st_size
683
684            ntrials.append(result)
685        #end
686
687        os.remove(self.Verf(self.totrev-1))
688        os.remove(self.Verf(self.totrev-2))
689        return ntrials
690    #end
691
692    def AppendVersion(self, f, n):
693        self.Checkout(n)
694        rf = open(self.Verf(n), "r")
695        data = rf.read()
696        f.write(data)
697        rf.close()
698        return len(data)
699    #end
700
701class RcsFinder:
702    def __init__(self):
703        self.subdirs  = []
704        self.rcsfiles = []
705        self.others   = []
706        self.skipped  = []
707        self.biground = 0
708    #end
709
710    def Scan1(self,dir):
711        dents = os.listdir(dir)
712        subdirs  = []
713        rcsfiles = []
714        others   = []
715        for dent in dents:
716            full = os.path.join(dir, dent)
717            if os.path.isdir(full):
718                subdirs.append(full)
719            elif dent[len(dent)-2:] == ",v":
720                rcsfiles.append(RcsFile(full))
721            else:
722                others.append(full)
723            #end
724        #end
725        self.subdirs  = self.subdirs  + subdirs
726        self.rcsfiles = self.rcsfiles + rcsfiles
727        self.others   = self.others   + others
728        return subdirs
729    #end
730
731    def Crawl(self, dir):
732        subdirs = [dir]
733        while subdirs:
734            s1 = self.Scan1(subdirs[0])
735            subdirs = subdirs[1:] + s1
736        #end
737    #end
738
739    def Summarize(self):
740        good = []
741        for rf in self.rcsfiles:
742            try:
743                rf.Sum1()
744                if rf.totrev < 2:
745                    raise SkipRcsException('too few versions (< 2)')
746                #end
747            except SkipRcsException, e:
748                #print 'skipping file %s: %s' % (rf.fname, e.reason)
749                self.skipped.append(rf)
750            else:
751                good.append(rf)
752            #end
753        self.rcsfiles = good
754    #end
755
756    def AllPairsByDate(self, runclass):
757        results = []
758        good = []
759        for rf in self.rcsfiles:
760            try:
761                results = results + rf.FilePairsByDate(runclass)
762            except SkipRcsException:
763                print 'file %s has compressed versions: skipping' % (rf.fname)
764            except NotEnoughVersions:
765                print 'testing %s on %s: not enough versions' % (runclass, rf.fname)
766            else:
767                good.append(rf)
768            #end
769        self.rcsfiles = good
770        self.ReportPairs(runclass, results)
771        return results
772    #end
773
774    def ReportPairs(self, name, results):
775        encode_time = 0
776        decode_time = 0
777        encode_size = 0
778        for r in results:
779            encode_time += r.encode_time.mean
780            decode_time += r.decode_time.mean
781            encode_size += r.encode_size
782        #end
783        print '%s rcs: encode %.2f s: decode %.2f s: size %d' % \
784              (name, encode_time, decode_time, encode_size)
785    #end
786
787    def MakeBigFiles(self, rand):
788        f1 = open(TMPDIR + "/big.1", "w")
789        f2 = open(TMPDIR + "/big.2", "w")
790        population = []
791        for file in self.rcsfiles:
792            if len(file.versions) < 2:
793                continue
794            population.append(file)
795        #end
796        f1sz = 0
797        f2sz = 0
798        fcount = int(len(population) * FILE_P)
799        assert fcount > 0
800        for file in rand.sample(population, fcount):
801            m = IGNORE_FILENAME.match(file.fname)
802            if m != None:
803                continue
804            #end
805            r1, r2 = rand.sample(xrange(0, len(file.versions)), 2)
806            f1sz += file.AppendVersion(f1, r1)
807            f2sz += file.AppendVersion(f2, r2)
808            #m.update('%s,%s,%s ' % (file.fname[len(RCSDIR):],
809            #file.Vstr(r1), file.Vstr(r2)))
810        #end
811        testkey = 'rcs%d' % self.biground
812        self.biground = self.biground + 1
813
814        print '%s; source %u bytes; target %u bytes' % (testkey, f1sz, f2sz)
815        f1.close()
816        f2.close()
817        return (TMPDIR + "/big.1",
818                TMPDIR + "/big.2",
819                testkey)
820    #end
821
822    def Generator(self):
823        return lambda rand: self.MakeBigFiles(rand)
824    #end
825#end
826
827# find a set of RCS files for testing
828def GetTestRcsFiles():
829    rcsf = RcsFinder()
830    rcsf.Crawl(RCSDIR)
831    if len(rcsf.rcsfiles) == 0:
832        raise CommandError('', 'no RCS files')
833    #end
834    rcsf.Summarize()
835    print "rcsfiles: rcsfiles %d; subdirs %d; others %d; skipped %d" % (
836        len(rcsf.rcsfiles),
837        len(rcsf.subdirs),
838        len(rcsf.others),
839        len(rcsf.skipped))
840    print StatList([x.rcssize for x in rcsf.rcsfiles], "rcssize").str
841    print StatList([x.totrev for x in rcsf.rcsfiles], "totrev").str
842    return rcsf
843#end
844
845class SampleDataTest:
846    def __init__(self, dirs):
847        dirs_in = dirs
848        self.pairs = []
849        while dirs:
850            d = dirs[0]
851            dirs = dirs[1:]
852            l = os.listdir(d)
853            files = []
854            for e in l:
855                p = os.path.join(d, e)
856                if os.path.isdir(p):
857                    dirs.append(p)
858                else:
859                    files.append(p)
860                #end
861            #end
862            if len(files) > 1:
863                files.sort()
864                for x in xrange(len(files)):
865                    for y in xrange(len(files)):
866                        self.pairs.append((files[x], files[y],
867                                           '%s-%s' % (files[x], files[y])))
868                    #end
869                #end
870            #end
871        #end
872        print "Sample data test using %d file pairs in %s" % (
873            len(self.pairs), dirs_in)
874    #end
875
876    def Generator(self):
877        return lambda rand: rand.choice(self.pairs)
878    #end
879#end
880
881# configs are represented as a list of values,
882# program takes a list of strings:
883def ConfigToArgs(config):
884    args = [ '-C',
885             ','.join([str(x) for x in config[0:SOFT_CONFIG_CNT]])]
886    for i in range(SOFT_CONFIG_CNT, len(CONFIG_ORDER)):
887        key = CONFIG_ARGMAP[CONFIG_ORDER[i]]
888        val = config[i]
889        if val == 'true' or val == 'false':
890            if val == 'true':
891                args.append('%s' % key)
892            #end
893        else:
894            args.append('%s=%s' % (key, val))
895        #end
896    #end
897    return args
898#end
899
900#
901class RandomTest:
902    def __init__(self, tnum, tinput, config, syntuple = None):
903        self.mytinput = tinput[2]
904        self.myconfig = config
905        self.tnum = tnum
906
907        if syntuple != None:
908            self.runtime = syntuple[0]
909            self.compsize = syntuple[1]
910            self.decodetime = None
911        else:
912            args = ConfigToArgs(config)
913            result = TimedTest(tinput[1], tinput[0], Xdelta3Runner(args))
914
915            self.runtime = result.encode_time.mean
916            self.compsize = result.encode_size
917            self.decodetime = result.decode_time.mean
918        #end
919
920        self.score = None
921        self.time_pos = None
922        self.size_pos = None
923        self.score_pos = None
924    #end
925
926    def __str__(self):
927        decodestr = ' %s' % self.decodetime
928        return 'time %.6f%s size %d%s << %s >>%s' % (
929            self.time(), ((self.time_pos != None) and
930                          (" (%s)" % self.time_pos) or ""),
931            self.size(), ((self.size_pos != None) and
932                          (" (%s)" % self.size_pos) or ""),
933            c2str(self.config()),
934            decodestr)
935    #end
936
937    def time(self):
938        return self.runtime
939    #end
940
941    def size(self):
942        return self.compsize
943    #end
944
945    def config(self):
946        return self.myconfig
947    #end
948
949    def score(self):
950        return self.score
951    #end
952
953    def tinput(self):
954        return self.mytinput
955    #end
956#end
957
958def PosInAlist(l, e):
959    for i in range(0, len(l)):
960        if l[i][1] == e:
961            return i;
962        #end
963    #end
964    return -1
965#end
966
967# Generates a set of num_results test configurations, given the list of
968# retest-configs.
969def RandomTestConfigs(rand, input_configs, num_results):
970
971    outputs = input_configs[:]
972    have_set = dict([(c,c) for c in input_configs])
973
974    # Compute a random configuration
975    def RandomConfig():
976        config = []
977        cmap = {}
978        for key in CONFIG_ORDER:
979            val = cmap[key] = (INPUT_SPEC(rand)[key])(cmap)
980            config.append(val)
981        #end
982        return tuple(config)
983    #end
984
985    while len(outputs) < num_results:
986        newc = None
987        for i in xrange(100):
988            c = RandomConfig()
989            if have_set.has_key(c):
990                continue
991            #end
992            have_set[c] = c
993            newc = c
994            break
995        if newc is None:
996            print 'stopped looking for configs at %d' % len(outputs)
997            break
998        #end
999        outputs.append(c)
1000    #end
1001    outputs.sort()
1002    return outputs
1003#end
1004
1005def RunOptimizationLoop(rand, generator, rounds):
1006    configs = []
1007    for rnum in xrange(rounds):
1008        configs = RandomTestConfigs(rand, configs, MAX_RESULTS)
1009        tinput = generator(rand)
1010        tests = []
1011        for x in xrange(len(configs)):
1012            t = RandomTest(x, tinput, configs[x])
1013            print 'Round %d test %d: %s' % (rnum, x, t)
1014            tests.append(t)
1015        #end
1016        results = ScoreTests(tests)
1017
1018        for r in results:
1019            c = r.config()
1020            if not test_all_config_results.has_key(c):
1021                test_all_config_results[c] = [r]
1022            else:
1023                test_all_config_results[c].append(r)
1024            #end
1025        #end
1026
1027        #GraphResults('expt%d' % rnum, results)
1028        #GraphSummary('sum%d' % rnum, results)
1029
1030        # re-test some fraction
1031        configs = [r.config() for r in results[0:int(MAX_RESULTS * KEEP_P)]]
1032    #end
1033#end
1034
1035# TODO: cleanup
1036test_all_config_results = {}
1037
1038def ScoreTests(results):
1039    scored = []
1040    timed = []
1041    sized = []
1042
1043    t_min = float(min([test.time() for test in results]))
1044    #t_max = float(max([test.time() for test in results]))
1045    s_min = float(min([test.size() for test in results]))
1046    #s_max = float(max([test.size() for test in results]))
1047
1048    for test in results:
1049
1050        # Hyperbolic function. Smaller scores still better
1051        red = 0.999  # minimum factors for each dimension are 1/1000
1052        test.score = ((test.size() - s_min * red) *
1053                      (test.time() - t_min * red))
1054
1055        scored.append((test.score, test))
1056        timed.append((test.time(), test))
1057        sized.append((test.size(), test))
1058    #end
1059
1060    scored.sort()
1061    timed.sort()
1062    sized.sort()
1063
1064    best_by_size = []
1065    best_by_time = []
1066
1067    pos = 0
1068    for (score, test) in scored:
1069        pos += 1
1070        test.score_pos = pos
1071    #end
1072
1073    scored = [x[1] for x in scored]
1074
1075    for test in scored:
1076        test.size_pos = PosInAlist(sized, test)
1077        test.time_pos = PosInAlist(timed, test)
1078    #end
1079
1080    for test in scored:
1081        c = test.config()
1082        s = 0.0
1083        print 'H-Score: %0.9f %s' % (test.score, test)
1084    #end
1085
1086    return scored
1087#end
1088
1089def GraphResults(desc, results):
1090    f = open("data-%s.csv" % desc, "w")
1091    for r in results:
1092        f.write("%0.9f\t%d\t# %s\n" % (r.time(), r.size(), r))
1093    #end
1094    f.close()
1095    os.system("./plot.sh data-%s.csv plot-%s.jpg" % (desc, desc))
1096#end
1097
1098def GraphSummary(desc, results_ignore):
1099    test_population = 0
1100    config_ordered = []
1101
1102    # drops duplicate test/config pairs (TODO: don't retest them)
1103    for config, cresults in test_all_config_results.items():
1104        input_config_map = {}
1105        uniq = []
1106        for test in cresults:
1107            assert test.config() == config
1108            test_population += 1
1109            key = test.tinput()
1110            if not input_config_map.has_key(key):
1111                input_config_map[key] = {}
1112            #end
1113            if input_config_map[key].has_key(config):
1114                print 'skipping repeat test %s vs. %s' % (input_config_map[key][config], test)
1115                continue
1116            #end
1117            input_config_map[key][config] = test
1118            uniq.append(test)
1119        #end
1120        config_ordered.append(uniq)
1121    #end
1122
1123    # sort configs descending by number of tests
1124    config_ordered.sort(lambda x, y: len(y) - len(x))
1125
1126    print 'population %d: %d configs %d results' % \
1127          (test_population,
1128           len(config_ordered),
1129           len(config_ordered[0]))
1130
1131    if config_ordered[0] == 1:
1132        return
1133    #end
1134
1135    # a map from test-key to test-list w/ various configs
1136    input_set = {}
1137    osize = len(config_ordered)
1138
1139    for i in xrange(len(config_ordered)):
1140        config = config_ordered[i][0].config()
1141        config_tests = config_ordered[i]
1142
1143        #print '%s has %d tested inputs' % (config, len(config_tests))
1144
1145        if len(input_set) == 0:
1146            input_set = dict([(t.tinput(), [t]) for t in config_tests])
1147            continue
1148        #end
1149
1150        # a map from test-key to test-list w/ various configs
1151        update_set = {}
1152        for r in config_tests:
1153            t = r.tinput()
1154            if input_set.has_key(t):
1155                update_set[t] = input_set[t] + [r]
1156            else:
1157                #print 'config %s does not have test %s' % (config, t)
1158                pass
1159            #end
1160        #end
1161
1162        if len(update_set) <= 1:
1163            break
1164        #end
1165
1166        input_set = update_set
1167
1168        # continue if there are more w/ the same number of inputs
1169        if i < (len(config_ordered) - 1) and \
1170           len(config_ordered[i + 1]) == len(config_tests):
1171            continue
1172        #end
1173
1174        # synthesize results for multi-test inputs
1175        config_num = None
1176
1177        # map of config to sum(various test-keys)
1178        smap = {}
1179        for (key, tests) in input_set.items():
1180            if config_num == None:
1181                # config_num should be the same in all elements
1182                config_num = len(tests)
1183                smap = dict([(r.config(),
1184                              (r.time(),
1185                               r.size()))
1186                             for r in tests])
1187            else:
1188                # compuate the per-config sum of time/size
1189                assert config_num == len(tests)
1190                smap = dict([(r.config(),
1191                              (smap[r.config()][0] + r.time(),
1192                               smap[r.config()][1] + r.size()))
1193                             for r in tests])
1194            #end
1195        #end
1196
1197        if config_num == 1:
1198            continue
1199        #end
1200
1201        if len(input_set) == osize:
1202            break
1203        #end
1204
1205        summary = '%s-%d' % (desc, len(input_set))
1206        osize = len(input_set)
1207
1208        print 'generate %s w/ %d configs' % (summary, config_num)
1209        syn = [RandomTest(0, (None, None, summary), config,
1210                          syntuple = (smap[config][0], smap[config][1]))
1211               for config in smap.keys()]
1212        syn = ScoreTests(syn)
1213        #print 'smap is %s' % (smap,)
1214        #print 'syn is %s' % (' and '.join([str(x) for x in syn]))
1215        #GraphResults(summary, syn)
1216    #end
1217#end
1218
1219def RunRegressionTest(pairs, rounds):
1220    for args in [
1221        [],
1222        ['-S=djw'],
1223        ['-B=412907520'],
1224        ['-B 412907520', ],
1225
1226                 ]:
1227        print "Args %s" % (args)
1228        for (file1, file2, testkey) in pairs:
1229            ttest = TimedTest(file1, file2, Xdelta3Runner(args, forkexec=True),
1230                              skip_trials = 0,
1231                              min_trials = 1,
1232                              max_trials = 1)
1233            print "Source %s\nTarget %s\nEncode %s\nDecode %s\nSize %s\n\n" % (
1234                file1, file2,
1235                ttest.encode_time.str,
1236                ttest.decode_time.str,
1237                ttest.encode_size)
1238    #end
1239#end
1240
1241if __name__ == "__main__":
1242    try:
1243        RunCommand(['rm', '-rf', TMPDIR])
1244        os.mkdir(TMPDIR)
1245
1246        #rcsf = GetTestRcsFiles()
1247        #generator = rcsf.Generator()
1248
1249        sample = SampleDataTest([SAMPLEDIR])
1250        generator = sample.Generator()
1251
1252        rand = random.Random(135135135135135)
1253
1254        RunRegressionTest(sample.pairs, TEST_ROUNDS)
1255
1256        #RunSpeedTest()
1257
1258        # the idea below is to add the default configurations and
1259        # xdelta1 to the optimization loop:
1260        #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-1', '-3', '-6']))
1261        #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9']))
1262        #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9', '-S', 'djw']))
1263        #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-1', '-S', 'djw']))
1264        #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9', '-T']))
1265        #x1r = rcsf.AllPairsByDate(Xdelta1RunClass())
1266
1267    except CommandError:
1268        pass
1269    else:
1270        RunCommand(['rm', '-rf', TMPDIR])
1271        pass
1272    #end
1273#end
1274