1#!/usr/bin/python2.6 2# xdelta 3 - delta compression tools and library 3# Copyright (C) 2003, 2006, 2007, 2008. Joshua P. MacDonald 4# 5# This program is free software; you can redistribute it and/or modify 6# it under the terms of the GNU General Public License as published by 7# the Free Software Foundation; either version 2 of the License, or 8# (at your option) any later version. 9# 10# This program is distributed in the hope that it will be useful, 11# but WITHOUT ANY WARRANTY; without even the implied warranty of 12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13# GNU General Public License for more details. 14# 15# You should have received a copy of the GNU General Public License 16# along with this program; if not, write to the Free Software 17# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 18 19# TODO: test 1.5 vs. greedy 20 21import os, sys, math, re, time, types, array, random 22import xdelta3 23 24#RCSDIR = '/mnt/polaroid/Polaroid/orbit_linux/home/jmacd/PRCS' 25#RCSDIR = '/tmp/PRCS_read_copy' 26#SAMPLEDIR = "/tmp/WESNOTH_tmp/diff" 27 28#RCSDIR = 'G:/jmacd/PRCS_copy' 29#SAMPLEDIR = "C:/sample_data/Wesnoth/tar" 30 31RCSDIR = '/Users/jmacd/src/ftp.kernel.org' 32SAMPLEDIR = '/Users/jmacd/src/xdelta3/linux' 33 34# 35MIN_SIZE = 0 36 37TIME_TOO_SHORT = 0.050 38 39SKIP_TRIALS = 2 40MIN_TRIALS = 3 41MAX_TRIALS = 15 42 43# 10 = fast 1.5 = slow 44MIN_STDDEV_PCT = 1.5 45 46# How many results per round 47MAX_RESULTS = 500 48TEST_ROUNDS = 10 49KEEP_P = (0.5) 50 51# For RCS testing, what percent to select 52FILE_P = (0.50) 53 54# For run-speed tests 55MIN_RUN = 1000 * 1000 * 1 56MAX_RUN = 1000 * 1000 * 10 57 58# Testwide defaults 59ALL_ARGS = [ 60 '-q' # '-vv' 61 ] 62 63# The first 7 args go to -C 64SOFT_CONFIG_CNT = 7 65 66CONFIG_ORDER = [ 'large_look', 67 'large_step', 68 'small_look', 69 'small_chain', 70 'small_lchain', 71 'max_lazy', 72 'long_enough', 73 74 # > SOFT_CONFIG_CNT 75 'nocompress', 76 'winsize', 77 'srcwinsize', 78 'sprevsz', 79 'iopt', 80 'djw', 81 'altcode', 82 ] 83 84CONFIG_ARGMAP = { 85 'winsize' : '-W', 86 'srcwinsize' : '-B', 87 'sprevsz' : '-P', 88 'iopt' : '-I', 89 'nocompress' : '-N', 90 'djw' : '-Sdjw', 91 'altcode' : '-T', 92 } 93 94def INPUT_SPEC(rand): 95 return { 96 97 # Time/space costs: 98 99 # -C 1,2,3,4,5,6,7 100 'large_look' : lambda d: rand.choice([9, 10, 11, 12]), 101 'large_step' : lambda d: rand.choice([25, 26, 27, 28, 29, 30]), 102 'small_look' : lambda d: rand.choice([4]), 103 'small_chain' : lambda d: rand.choice([1]), 104 'small_lchain' : lambda d: rand.choice([1]), 105 'max_lazy' : lambda d: rand.choice([4, 5, 6, 7, 8, 9, 10 ]), 106 107 # Note: long_enough only refers to small matching and has no effect if 108 # small_chain == 1. 109 'long_enough' : lambda d: rand.choice([4]), 110 111 # -N 112 'nocompress' : lambda d: rand.choice(['false']), 113 114 # -T 115 'altcode' : lambda d: rand.choice(['false']), 116 117 # -S djw 118 'djw' : lambda d: rand.choice(['false']), 119 120 # Memory costs: 121 122 # -W 123 'winsize' : lambda d: 8 * (1<<20), 124 125 # -B 126 'srcwinsize' : lambda d: 64 * (1<<20), 127 128 # -I 0 is unlimited 129 'iopt' : lambda d: 0, 130 131 # -P only powers of two 132 'sprevsz' : lambda d: rand.choice([x * (1<<16) for x in [4]]), 133 } 134#end 135 136# 137TMPDIR = '/tmp/xd3regtest.%d' % os.getpid() 138 139RUNFILE = os.path.join(TMPDIR, 'run') 140DFILE = os.path.join(TMPDIR, 'output') 141RFILE = os.path.join(TMPDIR, 'recon') 142CMPTMP1 = os.path.join(TMPDIR, 'cmptmp1') 143CMPTMP2 = os.path.join(TMPDIR, 'cmptmp2') 144 145HEAD_STATE = 0 146BAR_STATE = 1 147REV_STATE = 2 148DATE_STATE = 3 149 150# 151IGNORE_FILENAME = re.compile('.*\\.(gif|jpg).*') 152 153# rcs output 154RE_TOTREV = re.compile('total revisions: (\\d+)') 155RE_BAR = re.compile('----------------------------') 156RE_REV = re.compile('revision (.+)') 157RE_DATE = re.compile('date: ([^;]+);.*') 158# xdelta output 159RE_HDRSZ = re.compile('VCDIFF header size: +(\\d+)') 160RE_EXTCOMP = re.compile('XDELTA ext comp.*') 161 162def c2str(c): 163 return ' '.join(['%s' % x for x in c]) 164#end 165 166def SumList(l): 167 return reduce(lambda x,y: x+y, l) 168#end 169 170# returns (total, mean, stddev, q2 (median), 171# (q3-q1)/2 ("semi-interquartile range"), max-min (spread)) 172class StatList: 173 def __init__(self,l,desc): 174 cnt = len(l) 175 assert(cnt > 1) 176 l.sort() 177 self.cnt = cnt 178 self.l = l 179 self.total = SumList(l) 180 self.mean = self.total / float(self.cnt) 181 self.s = math.sqrt(SumList([(x-self.mean) * 182 (x - self.mean) for x in l]) / 183 float(self.cnt-1)) 184 self.q0 = l[0] 185 self.q1 = l[int(self.cnt/4.0+0.5)] 186 self.q2 = l[int(self.cnt/2.0+0.5)] 187 self.q3 = l[min(self.cnt-1,int((3.0*self.cnt)/4.0+0.5))] 188 self.q4 = l[self.cnt-1] 189 self.siqr = (self.q3-self.q1)/2.0; 190 self.spread = (self.q4-self.q0) 191 if len(l) == 1: 192 self.str = '%s %s' % (desc, l[0]) 193 else: 194 self.str = '%s mean %.1f: 25%-ile %d %d %d %d %d' % \ 195 (desc, self.mean, self.q0, self.q1, self.q2, self.q3, self.q4) 196 #end 197#end 198 199def RunCommand(args, ok = [0]): 200 #print 'run command %s' % (' '.join(args)) 201 p = os.spawnvp(os.P_WAIT, args[0], args) 202 if p not in ok: 203 raise CommandError(args, 'exited %d' % p) 204 #end 205#end 206 207def RunCommandIO(args,infn,outfn): 208 p = os.fork() 209 if p == 0: 210 os.dup2(os.open(infn,os.O_RDONLY),0) 211 os.dup2(os.open(outfn,os.O_CREAT|os.O_TRUNC|os.O_WRONLY),1) 212 os.execvp(args[0], args) 213 else: 214 s = os.waitpid(p,0) 215 o = os.WEXITSTATUS(s[1]) 216 if not os.WIFEXITED(s[1]) or o != 0: 217 raise CommandError(args, 'exited %d' % o) 218 #end 219 #end 220#end 221 222class TimedTest: 223 def __init__(self, target, source, runnable, 224 skip_trials = SKIP_TRIALS, 225 min_trials = MIN_TRIALS, 226 max_trials = MAX_TRIALS, 227 min_stddev_pct = MIN_STDDEV_PCT): 228 self.target = target 229 self.source = source 230 self.runnable = runnable 231 232 self.skip_trials = skip_trials 233 self.min_trials = min(min_trials, max_trials) 234 self.max_trials = max_trials 235 self.min_stddev_pct = min_stddev_pct 236 237 self.encode_time = self.DoTest(DFILE, 238 lambda x: x.Encode(self.target, 239 self.source, DFILE)) 240 self.encode_size = runnable.EncodeSize(DFILE) 241 242 self.decode_time = self.DoTest(RFILE, 243 lambda x: x.Decode(DFILE, 244 self.source, RFILE), 245 ) 246 runnable.Verify(self.target, RFILE) 247 #end 248 249 def DoTest(self, fname, func): 250 trials = 0 251 measured = [] 252 253 while 1: 254 try: 255 os.remove(fname) 256 except OSError: 257 pass 258 259 start_time = time.time() 260 start_clock = time.clock() 261 262 func(self.runnable) 263 264 total_clock = (time.clock() - start_clock) 265 total_time = (time.time() - start_time) 266 267 elap_time = max(total_time, 0.0000001) 268 elap_clock = max(total_clock, 0.0000001) 269 270 trials = trials + 1 271 272 # skip some of the first trials 273 if trials > self.skip_trials: 274 measured.append((elap_clock, elap_time)) 275 #print 'measurement total: %.1f ms' % (total_time * 1000.0) 276 277 # at least so many 278 if trials < (self.skip_trials + self.min_trials): 279 #print 'continue: need more trials: %d' % trials 280 continue 281 282 # compute %variance 283 done = 0 284 if self.skip_trials + self.min_trials <= 2: 285 measured = measured + measured; 286 done = 1 287 #end 288 289 time_stat = StatList([x[1] for x in measured], 'elap time') 290 sp = float(time_stat.s) / float(time_stat.mean) 291 292 # what if MAX_TRIALS is exceeded? 293 too_many = (trials - self.skip_trials) >= self.max_trials 294 good = (100.0 * sp) < self.min_stddev_pct 295 if done or too_many or good: 296 trials = trials - self.skip_trials 297 if not done and not good: 298 #print 'too many trials: %d' % trials 299 pass 300 #clock = StatList([x[0] for x in measured], 'elap clock') 301 return time_stat 302 #end 303 #end 304 #end 305#end 306 307def Decimals(start, end): 308 l = [] 309 step = start 310 while 1: 311 r = range(step, step * 10, step) 312 l = l + r 313 if step * 10 >= end: 314 l.append(step * 10) 315 break 316 step = step * 10 317 return l 318#end 319 320# This tests the raw speed of 0-byte inputs 321def RunSpeedTest(): 322 for L in Decimals(MIN_RUN, MAX_RUN): 323 SetFileSize(RUNFILE, L) 324 325 trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<20)])) 326 ReportSpeed(L, trx, '1MB ') 327 328 trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<19)])) 329 ReportSpeed(L, trx, '512k') 330 331 trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<18)])) 332 ReportSpeed(L, trx, '256k') 333 334 trm = TimedTest(RUNFILE, None, Xdelta3Mod1(RUNFILE)) 335 ReportSpeed(L, trm, 'swig') 336 337 trg = TimedTest(RUNFILE, None, GzipRun1()) 338 ReportSpeed(L,trg,'gzip') 339 #end 340#end 341 342def SetFileSize(F,L): 343 fd = os.open(F, os.O_CREAT | os.O_WRONLY) 344 os.ftruncate(fd,L) 345 assert os.fstat(fd).st_size == L 346 os.close(fd) 347#end 348 349def ReportSpeed(L,tr,desc): 350 print '%s run length %u: size %u: time %.3f ms: decode %.3f ms' % \ 351 (desc, L, 352 tr.encode_size, 353 tr.encode_time.mean * 1000.0, 354 tr.decode_time.mean * 1000.0) 355#end 356 357class Xdelta3RunClass: 358 def __init__(self, extra): 359 self.extra = extra 360 #end 361 362 def __str__(self): 363 return ' '.join(self.extra) 364 #end 365 366 def New(self): 367 return Xdelta3Runner(self.extra) 368 #end 369#end 370 371class Xdelta3Runner: 372 # Use "forkexec" to get special command-line only features like 373 # external compression support. 374 def __init__(self, extra, forkexec=False): 375 self.forkexec = forkexec 376 self.extra = extra 377 #end 378 379 def Encode(self, target, source, output): 380 args = (ALL_ARGS + 381 self.extra + 382 ['-e']) 383 if source: 384 args.append('-s') 385 args.append(source) 386 #end 387 args = args + [target, output] 388 self.Main(args) 389 #end 390 391 def Decode(self, input, source, output): 392 args = (ALL_ARGS + 393 ['-d']) 394 if source: 395 args.append('-s') 396 args.append(source) 397 #end 398 args = args + [input, output] 399 self.Main(args) 400 #end 401 402 def Verify(self, target, recon): 403 if target[-3:] == ".gz": 404 RunCommandIO(('gzip', '-dc'), target, CMPTMP1) 405 RunCommandIO(('gzip', '-dc'), recon, CMPTMP2) 406 RunCommand(('cmp', CMPTMP1, CMPTMP2)) 407 else: 408 RunCommand(('cmp', target, recon)) 409 #end 410 411 def EncodeSize(self, output): 412 return os.stat(output).st_size 413 #end 414 415 def Main(self, args): 416 try: 417 if self.forkexec: 418 RunCommand(['../xdelta3'] + args) 419 else: 420 xdelta3.xd3_main_cmdline(args) 421 except Exception, e: 422 raise CommandError(args, "xdelta3.main exception: %s" % e) 423 #end 424 #end 425#end 426 427class Xdelta3Mod1: 428 def __init__(self, file): 429 self.target_data = open(file, 'r').read() 430 #end 431 432 def Encode(self, ignore1, ignore2, ignore3): 433 r1, encoded = xdelta3.xd3_encode_memory(self.target_data, None, 1000000, 1<<10) 434 if r1 != 0: 435 raise CommandError('memory', 'encode failed: %s' % r1) 436 #end 437 self.encoded = encoded 438 #end 439 440 def Decode(self, ignore1, ignore2, ignore3): 441 r2, data1 = xdelta3.xd3_decode_memory(self.encoded, None, len(self.target_data)) 442 if r2 != 0: 443 raise CommandError('memory', 'decode failed: %s' % r1) 444 #end 445 self.decoded = data1 446 #end 447 448 def Verify(self, ignore1, ignore2): 449 if self.target_data != self.decoded: 450 raise CommandError('memory', 'bad decode') 451 #end 452 #end 453 454 def EncodeSize(self, ignore1): 455 return len(self.encoded) 456 #end 457#end 458 459class GzipRun1: 460 def Encode(self, target, source, output): 461 assert source == None 462 RunCommandIO(['gzip', '-cf'], target, output) 463 #end 464 465 def Decode(self, input, source, output): 466 assert source == None 467 RunCommandIO(['gzip', '-dcf'], input, output) 468 #end 469 470 def Verify(self, target, recon): 471 RunCommand(('cmp', target, recon)) 472 #end 473 474 def EncodeSize(self, output): 475 return os.stat(output).st_size 476 #end 477#end 478 479class Xdelta1RunClass: 480 def __str__(self): 481 return 'xdelta1' 482 #end 483 484 def New(self): 485 return Xdelta1Runner() 486 #end 487#end 488 489class Xdelta1Runner: 490 def Encode(self, target, source, output): 491 assert source != None 492 args = ['xdelta1', 'delta', '-q', source, target, output] 493 RunCommand(args, [0, 1]) 494 #end 495 496 def Decode(self, input, source, output): 497 assert source != None 498 args = ['xdelta1', 'patch', '-q', input, source, output] 499 # Note: for dumb historical reasons, xdelta1 returns 1 or 0 500 RunCommand(args) 501 #end 502 503 def Verify(self, target, recon): 504 RunCommand(('cmp', target, recon)) 505 #end 506 507 def EncodeSize(self, output): 508 return os.stat(output).st_size 509 #end 510#end 511 512# exceptions 513class SkipRcsException: 514 def __init__(self,reason): 515 self.reason = reason 516 #end 517#end 518 519class NotEnoughVersions: 520 def __init__(self): 521 pass 522 #end 523#end 524 525class CommandError: 526 def __init__(self,cmd,str): 527 if type(cmd) is types.TupleType or \ 528 type(cmd) is types.ListType: 529 cmd = reduce(lambda x,y: '%s %s' % (x,y),cmd) 530 #end 531 print 'command was: ',cmd 532 print 'command failed: ',str 533 print 'have fun debugging' 534 #end 535#end 536 537class RcsVersion: 538 def __init__(self,vstr): 539 self.vstr = vstr 540 #end 541 def __cmp__(self,other): 542 return cmp(self.date, other.date) 543 #end 544 def __str__(self): 545 return str(self.vstr) 546 #end 547#end 548 549class RcsFile: 550 551 def __init__(self, fname): 552 self.fname = fname 553 self.versions = [] 554 self.state = HEAD_STATE 555 #end 556 557 def SetTotRev(self,s): 558 self.totrev = int(s) 559 #end 560 561 def Rev(self,s): 562 self.rev = RcsVersion(s) 563 if len(self.versions) >= self.totrev: 564 raise SkipRcsException('too many versions (in log messages)') 565 #end 566 self.versions.append(self.rev) 567 #end 568 569 def Date(self,s): 570 self.rev.date = s 571 #end 572 573 def Match(self, line, state, rx, gp, newstate, f): 574 if state == self.state: 575 m = rx.match(line) 576 if m: 577 if f: 578 f(m.group(gp)) 579 #end 580 self.state = newstate 581 return 1 582 #end 583 #end 584 return None 585 #end 586 587 def Sum1Rlog(self): 588 f = os.popen('rlog '+self.fname, "r") 589 l = f.readline() 590 while l: 591 if self.Match(l, HEAD_STATE, RE_TOTREV, 1, BAR_STATE, self.SetTotRev): 592 pass 593 elif self.Match(l, BAR_STATE, RE_BAR, 1, REV_STATE, None): 594 pass 595 elif self.Match(l, REV_STATE, RE_REV, 1, DATE_STATE, self.Rev): 596 pass 597 elif self.Match(l, DATE_STATE, RE_DATE, 1, BAR_STATE, self.Date): 598 pass 599 #end 600 l = f.readline() 601 #end 602 c = f.close() 603 if c != None: 604 raise c 605 #end 606 #end 607 608 def Sum1(self): 609 st = os.stat(self.fname) 610 self.rcssize = st.st_size 611 self.Sum1Rlog() 612 if self.totrev != len(self.versions): 613 raise SkipRcsException('wrong version count') 614 #end 615 self.versions.sort() 616 #end 617 618 def Checkout(self,n): 619 v = self.versions[n] 620 out = open(self.Verf(n), "w") 621 cmd = 'co -ko -p%s %s' % (v.vstr, self.fname) 622 total = 0 623 (inf, 624 stream, 625 err) = os.popen3(cmd, "r") 626 inf.close() 627 buf = stream.read() 628 while buf: 629 total = total + len(buf) 630 out.write(buf) 631 buf = stream.read() 632 #end 633 v.vsize = total 634 estr = '' 635 buf = err.read() 636 while buf: 637 estr = estr + buf 638 buf = err.read() 639 #end 640 if stream.close(): 641 raise CommandError(cmd, 'checkout failed: %s\n%s\n%s' % (v.vstr, self.fname, estr)) 642 #end 643 out.close() 644 err.close() 645 #end 646 647 def Vdate(self,n): 648 return self.versions[n].date 649 #end 650 651 def Vstr(self,n): 652 return self.versions[n].vstr 653 #end 654 655 def Verf(self,n): 656 return os.path.join(TMPDIR, 'input.%d' % n) 657 #end 658 659 def FilePairsByDate(self, runclass): 660 if self.totrev < 2: 661 raise NotEnoughVersions() 662 #end 663 self.Checkout(0) 664 ntrials = [] 665 if self.totrev < 2: 666 return vtrials 667 #end 668 for v in range(0,self.totrev-1): 669 if v > 1: 670 os.remove(self.Verf(v-1)) 671 #end 672 self.Checkout(v+1) 673 if os.stat(self.Verf(v)).st_size < MIN_SIZE or \ 674 os.stat(self.Verf(v+1)).st_size < MIN_SIZE: 675 continue 676 #end 677 678 result = TimedTest(self.Verf(v+1), 679 self.Verf(v), 680 runclass.New()) 681 682 target_size = os.stat(self.Verf(v+1)).st_size 683 684 ntrials.append(result) 685 #end 686 687 os.remove(self.Verf(self.totrev-1)) 688 os.remove(self.Verf(self.totrev-2)) 689 return ntrials 690 #end 691 692 def AppendVersion(self, f, n): 693 self.Checkout(n) 694 rf = open(self.Verf(n), "r") 695 data = rf.read() 696 f.write(data) 697 rf.close() 698 return len(data) 699 #end 700 701class RcsFinder: 702 def __init__(self): 703 self.subdirs = [] 704 self.rcsfiles = [] 705 self.others = [] 706 self.skipped = [] 707 self.biground = 0 708 #end 709 710 def Scan1(self,dir): 711 dents = os.listdir(dir) 712 subdirs = [] 713 rcsfiles = [] 714 others = [] 715 for dent in dents: 716 full = os.path.join(dir, dent) 717 if os.path.isdir(full): 718 subdirs.append(full) 719 elif dent[len(dent)-2:] == ",v": 720 rcsfiles.append(RcsFile(full)) 721 else: 722 others.append(full) 723 #end 724 #end 725 self.subdirs = self.subdirs + subdirs 726 self.rcsfiles = self.rcsfiles + rcsfiles 727 self.others = self.others + others 728 return subdirs 729 #end 730 731 def Crawl(self, dir): 732 subdirs = [dir] 733 while subdirs: 734 s1 = self.Scan1(subdirs[0]) 735 subdirs = subdirs[1:] + s1 736 #end 737 #end 738 739 def Summarize(self): 740 good = [] 741 for rf in self.rcsfiles: 742 try: 743 rf.Sum1() 744 if rf.totrev < 2: 745 raise SkipRcsException('too few versions (< 2)') 746 #end 747 except SkipRcsException, e: 748 #print 'skipping file %s: %s' % (rf.fname, e.reason) 749 self.skipped.append(rf) 750 else: 751 good.append(rf) 752 #end 753 self.rcsfiles = good 754 #end 755 756 def AllPairsByDate(self, runclass): 757 results = [] 758 good = [] 759 for rf in self.rcsfiles: 760 try: 761 results = results + rf.FilePairsByDate(runclass) 762 except SkipRcsException: 763 print 'file %s has compressed versions: skipping' % (rf.fname) 764 except NotEnoughVersions: 765 print 'testing %s on %s: not enough versions' % (runclass, rf.fname) 766 else: 767 good.append(rf) 768 #end 769 self.rcsfiles = good 770 self.ReportPairs(runclass, results) 771 return results 772 #end 773 774 def ReportPairs(self, name, results): 775 encode_time = 0 776 decode_time = 0 777 encode_size = 0 778 for r in results: 779 encode_time += r.encode_time.mean 780 decode_time += r.decode_time.mean 781 encode_size += r.encode_size 782 #end 783 print '%s rcs: encode %.2f s: decode %.2f s: size %d' % \ 784 (name, encode_time, decode_time, encode_size) 785 #end 786 787 def MakeBigFiles(self, rand): 788 f1 = open(TMPDIR + "/big.1", "w") 789 f2 = open(TMPDIR + "/big.2", "w") 790 population = [] 791 for file in self.rcsfiles: 792 if len(file.versions) < 2: 793 continue 794 population.append(file) 795 #end 796 f1sz = 0 797 f2sz = 0 798 fcount = int(len(population) * FILE_P) 799 assert fcount > 0 800 for file in rand.sample(population, fcount): 801 m = IGNORE_FILENAME.match(file.fname) 802 if m != None: 803 continue 804 #end 805 r1, r2 = rand.sample(xrange(0, len(file.versions)), 2) 806 f1sz += file.AppendVersion(f1, r1) 807 f2sz += file.AppendVersion(f2, r2) 808 #m.update('%s,%s,%s ' % (file.fname[len(RCSDIR):], 809 #file.Vstr(r1), file.Vstr(r2))) 810 #end 811 testkey = 'rcs%d' % self.biground 812 self.biground = self.biground + 1 813 814 print '%s; source %u bytes; target %u bytes' % (testkey, f1sz, f2sz) 815 f1.close() 816 f2.close() 817 return (TMPDIR + "/big.1", 818 TMPDIR + "/big.2", 819 testkey) 820 #end 821 822 def Generator(self): 823 return lambda rand: self.MakeBigFiles(rand) 824 #end 825#end 826 827# find a set of RCS files for testing 828def GetTestRcsFiles(): 829 rcsf = RcsFinder() 830 rcsf.Crawl(RCSDIR) 831 if len(rcsf.rcsfiles) == 0: 832 raise CommandError('', 'no RCS files') 833 #end 834 rcsf.Summarize() 835 print "rcsfiles: rcsfiles %d; subdirs %d; others %d; skipped %d" % ( 836 len(rcsf.rcsfiles), 837 len(rcsf.subdirs), 838 len(rcsf.others), 839 len(rcsf.skipped)) 840 print StatList([x.rcssize for x in rcsf.rcsfiles], "rcssize").str 841 print StatList([x.totrev for x in rcsf.rcsfiles], "totrev").str 842 return rcsf 843#end 844 845class SampleDataTest: 846 def __init__(self, dirs): 847 dirs_in = dirs 848 self.pairs = [] 849 while dirs: 850 d = dirs[0] 851 dirs = dirs[1:] 852 l = os.listdir(d) 853 files = [] 854 for e in l: 855 p = os.path.join(d, e) 856 if os.path.isdir(p): 857 dirs.append(p) 858 else: 859 files.append(p) 860 #end 861 #end 862 if len(files) > 1: 863 files.sort() 864 for x in xrange(len(files)): 865 for y in xrange(len(files)): 866 self.pairs.append((files[x], files[y], 867 '%s-%s' % (files[x], files[y]))) 868 #end 869 #end 870 #end 871 #end 872 print "Sample data test using %d file pairs in %s" % ( 873 len(self.pairs), dirs_in) 874 #end 875 876 def Generator(self): 877 return lambda rand: rand.choice(self.pairs) 878 #end 879#end 880 881# configs are represented as a list of values, 882# program takes a list of strings: 883def ConfigToArgs(config): 884 args = [ '-C', 885 ','.join([str(x) for x in config[0:SOFT_CONFIG_CNT]])] 886 for i in range(SOFT_CONFIG_CNT, len(CONFIG_ORDER)): 887 key = CONFIG_ARGMAP[CONFIG_ORDER[i]] 888 val = config[i] 889 if val == 'true' or val == 'false': 890 if val == 'true': 891 args.append('%s' % key) 892 #end 893 else: 894 args.append('%s=%s' % (key, val)) 895 #end 896 #end 897 return args 898#end 899 900# 901class RandomTest: 902 def __init__(self, tnum, tinput, config, syntuple = None): 903 self.mytinput = tinput[2] 904 self.myconfig = config 905 self.tnum = tnum 906 907 if syntuple != None: 908 self.runtime = syntuple[0] 909 self.compsize = syntuple[1] 910 self.decodetime = None 911 else: 912 args = ConfigToArgs(config) 913 result = TimedTest(tinput[1], tinput[0], Xdelta3Runner(args)) 914 915 self.runtime = result.encode_time.mean 916 self.compsize = result.encode_size 917 self.decodetime = result.decode_time.mean 918 #end 919 920 self.score = None 921 self.time_pos = None 922 self.size_pos = None 923 self.score_pos = None 924 #end 925 926 def __str__(self): 927 decodestr = ' %s' % self.decodetime 928 return 'time %.6f%s size %d%s << %s >>%s' % ( 929 self.time(), ((self.time_pos != None) and 930 (" (%s)" % self.time_pos) or ""), 931 self.size(), ((self.size_pos != None) and 932 (" (%s)" % self.size_pos) or ""), 933 c2str(self.config()), 934 decodestr) 935 #end 936 937 def time(self): 938 return self.runtime 939 #end 940 941 def size(self): 942 return self.compsize 943 #end 944 945 def config(self): 946 return self.myconfig 947 #end 948 949 def score(self): 950 return self.score 951 #end 952 953 def tinput(self): 954 return self.mytinput 955 #end 956#end 957 958def PosInAlist(l, e): 959 for i in range(0, len(l)): 960 if l[i][1] == e: 961 return i; 962 #end 963 #end 964 return -1 965#end 966 967# Generates a set of num_results test configurations, given the list of 968# retest-configs. 969def RandomTestConfigs(rand, input_configs, num_results): 970 971 outputs = input_configs[:] 972 have_set = dict([(c,c) for c in input_configs]) 973 974 # Compute a random configuration 975 def RandomConfig(): 976 config = [] 977 cmap = {} 978 for key in CONFIG_ORDER: 979 val = cmap[key] = (INPUT_SPEC(rand)[key])(cmap) 980 config.append(val) 981 #end 982 return tuple(config) 983 #end 984 985 while len(outputs) < num_results: 986 newc = None 987 for i in xrange(100): 988 c = RandomConfig() 989 if have_set.has_key(c): 990 continue 991 #end 992 have_set[c] = c 993 newc = c 994 break 995 if newc is None: 996 print 'stopped looking for configs at %d' % len(outputs) 997 break 998 #end 999 outputs.append(c) 1000 #end 1001 outputs.sort() 1002 return outputs 1003#end 1004 1005def RunOptimizationLoop(rand, generator, rounds): 1006 configs = [] 1007 for rnum in xrange(rounds): 1008 configs = RandomTestConfigs(rand, configs, MAX_RESULTS) 1009 tinput = generator(rand) 1010 tests = [] 1011 for x in xrange(len(configs)): 1012 t = RandomTest(x, tinput, configs[x]) 1013 print 'Round %d test %d: %s' % (rnum, x, t) 1014 tests.append(t) 1015 #end 1016 results = ScoreTests(tests) 1017 1018 for r in results: 1019 c = r.config() 1020 if not test_all_config_results.has_key(c): 1021 test_all_config_results[c] = [r] 1022 else: 1023 test_all_config_results[c].append(r) 1024 #end 1025 #end 1026 1027 #GraphResults('expt%d' % rnum, results) 1028 #GraphSummary('sum%d' % rnum, results) 1029 1030 # re-test some fraction 1031 configs = [r.config() for r in results[0:int(MAX_RESULTS * KEEP_P)]] 1032 #end 1033#end 1034 1035# TODO: cleanup 1036test_all_config_results = {} 1037 1038def ScoreTests(results): 1039 scored = [] 1040 timed = [] 1041 sized = [] 1042 1043 t_min = float(min([test.time() for test in results])) 1044 #t_max = float(max([test.time() for test in results])) 1045 s_min = float(min([test.size() for test in results])) 1046 #s_max = float(max([test.size() for test in results])) 1047 1048 for test in results: 1049 1050 # Hyperbolic function. Smaller scores still better 1051 red = 0.999 # minimum factors for each dimension are 1/1000 1052 test.score = ((test.size() - s_min * red) * 1053 (test.time() - t_min * red)) 1054 1055 scored.append((test.score, test)) 1056 timed.append((test.time(), test)) 1057 sized.append((test.size(), test)) 1058 #end 1059 1060 scored.sort() 1061 timed.sort() 1062 sized.sort() 1063 1064 best_by_size = [] 1065 best_by_time = [] 1066 1067 pos = 0 1068 for (score, test) in scored: 1069 pos += 1 1070 test.score_pos = pos 1071 #end 1072 1073 scored = [x[1] for x in scored] 1074 1075 for test in scored: 1076 test.size_pos = PosInAlist(sized, test) 1077 test.time_pos = PosInAlist(timed, test) 1078 #end 1079 1080 for test in scored: 1081 c = test.config() 1082 s = 0.0 1083 print 'H-Score: %0.9f %s' % (test.score, test) 1084 #end 1085 1086 return scored 1087#end 1088 1089def GraphResults(desc, results): 1090 f = open("data-%s.csv" % desc, "w") 1091 for r in results: 1092 f.write("%0.9f\t%d\t# %s\n" % (r.time(), r.size(), r)) 1093 #end 1094 f.close() 1095 os.system("./plot.sh data-%s.csv plot-%s.jpg" % (desc, desc)) 1096#end 1097 1098def GraphSummary(desc, results_ignore): 1099 test_population = 0 1100 config_ordered = [] 1101 1102 # drops duplicate test/config pairs (TODO: don't retest them) 1103 for config, cresults in test_all_config_results.items(): 1104 input_config_map = {} 1105 uniq = [] 1106 for test in cresults: 1107 assert test.config() == config 1108 test_population += 1 1109 key = test.tinput() 1110 if not input_config_map.has_key(key): 1111 input_config_map[key] = {} 1112 #end 1113 if input_config_map[key].has_key(config): 1114 print 'skipping repeat test %s vs. %s' % (input_config_map[key][config], test) 1115 continue 1116 #end 1117 input_config_map[key][config] = test 1118 uniq.append(test) 1119 #end 1120 config_ordered.append(uniq) 1121 #end 1122 1123 # sort configs descending by number of tests 1124 config_ordered.sort(lambda x, y: len(y) - len(x)) 1125 1126 print 'population %d: %d configs %d results' % \ 1127 (test_population, 1128 len(config_ordered), 1129 len(config_ordered[0])) 1130 1131 if config_ordered[0] == 1: 1132 return 1133 #end 1134 1135 # a map from test-key to test-list w/ various configs 1136 input_set = {} 1137 osize = len(config_ordered) 1138 1139 for i in xrange(len(config_ordered)): 1140 config = config_ordered[i][0].config() 1141 config_tests = config_ordered[i] 1142 1143 #print '%s has %d tested inputs' % (config, len(config_tests)) 1144 1145 if len(input_set) == 0: 1146 input_set = dict([(t.tinput(), [t]) for t in config_tests]) 1147 continue 1148 #end 1149 1150 # a map from test-key to test-list w/ various configs 1151 update_set = {} 1152 for r in config_tests: 1153 t = r.tinput() 1154 if input_set.has_key(t): 1155 update_set[t] = input_set[t] + [r] 1156 else: 1157 #print 'config %s does not have test %s' % (config, t) 1158 pass 1159 #end 1160 #end 1161 1162 if len(update_set) <= 1: 1163 break 1164 #end 1165 1166 input_set = update_set 1167 1168 # continue if there are more w/ the same number of inputs 1169 if i < (len(config_ordered) - 1) and \ 1170 len(config_ordered[i + 1]) == len(config_tests): 1171 continue 1172 #end 1173 1174 # synthesize results for multi-test inputs 1175 config_num = None 1176 1177 # map of config to sum(various test-keys) 1178 smap = {} 1179 for (key, tests) in input_set.items(): 1180 if config_num == None: 1181 # config_num should be the same in all elements 1182 config_num = len(tests) 1183 smap = dict([(r.config(), 1184 (r.time(), 1185 r.size())) 1186 for r in tests]) 1187 else: 1188 # compuate the per-config sum of time/size 1189 assert config_num == len(tests) 1190 smap = dict([(r.config(), 1191 (smap[r.config()][0] + r.time(), 1192 smap[r.config()][1] + r.size())) 1193 for r in tests]) 1194 #end 1195 #end 1196 1197 if config_num == 1: 1198 continue 1199 #end 1200 1201 if len(input_set) == osize: 1202 break 1203 #end 1204 1205 summary = '%s-%d' % (desc, len(input_set)) 1206 osize = len(input_set) 1207 1208 print 'generate %s w/ %d configs' % (summary, config_num) 1209 syn = [RandomTest(0, (None, None, summary), config, 1210 syntuple = (smap[config][0], smap[config][1])) 1211 for config in smap.keys()] 1212 syn = ScoreTests(syn) 1213 #print 'smap is %s' % (smap,) 1214 #print 'syn is %s' % (' and '.join([str(x) for x in syn])) 1215 #GraphResults(summary, syn) 1216 #end 1217#end 1218 1219def RunRegressionTest(pairs, rounds): 1220 for args in [ 1221 [], 1222 ['-S=djw'], 1223 ['-B=412907520'], 1224 ['-B 412907520', ], 1225 1226 ]: 1227 print "Args %s" % (args) 1228 for (file1, file2, testkey) in pairs: 1229 ttest = TimedTest(file1, file2, Xdelta3Runner(args, forkexec=True), 1230 skip_trials = 0, 1231 min_trials = 1, 1232 max_trials = 1) 1233 print "Source %s\nTarget %s\nEncode %s\nDecode %s\nSize %s\n\n" % ( 1234 file1, file2, 1235 ttest.encode_time.str, 1236 ttest.decode_time.str, 1237 ttest.encode_size) 1238 #end 1239#end 1240 1241if __name__ == "__main__": 1242 try: 1243 RunCommand(['rm', '-rf', TMPDIR]) 1244 os.mkdir(TMPDIR) 1245 1246 #rcsf = GetTestRcsFiles() 1247 #generator = rcsf.Generator() 1248 1249 sample = SampleDataTest([SAMPLEDIR]) 1250 generator = sample.Generator() 1251 1252 rand = random.Random(135135135135135) 1253 1254 RunRegressionTest(sample.pairs, TEST_ROUNDS) 1255 1256 #RunSpeedTest() 1257 1258 # the idea below is to add the default configurations and 1259 # xdelta1 to the optimization loop: 1260 #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-1', '-3', '-6'])) 1261 #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9'])) 1262 #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9', '-S', 'djw'])) 1263 #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-1', '-S', 'djw'])) 1264 #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9', '-T'])) 1265 #x1r = rcsf.AllPairsByDate(Xdelta1RunClass()) 1266 1267 except CommandError: 1268 pass 1269 else: 1270 RunCommand(['rm', '-rf', TMPDIR]) 1271 pass 1272 #end 1273#end 1274