1#!/bin/sh
2
3
4#
5# This script does a binary search to look for the smallest input that will
6# break something.
7#
8
9# The folder for us to store our temporary files. There may be many of them.
10WORKDIR=tmp/search
11
12# This setting determines where we save the input we generate.
13# You should change it to something you have write access to.
14CLOUDINPUT=gs://jpmartin/hellbender-test-inputs/search_input.bam
15
16# file containing the known good output
17GOODOUTPUT=$WORKDIR/recaltable.correct
18
19# file containing the output under test.
20TESTOUTPUT=$WORKDIR/recaltable.cloud
21
22#
23# Set here the genomic interval you'd like to start from.
24#
25FIRST=1000250;
26LAST=1000312;
27
28
29# The code below assumes there's a "hb" script on the root that runs hellbender.
30# Its contents would be:
31#
32# #!/bin/sh
33# build/install/$HELLBENDER/bin/$HELLBENDER "$@"
34#
35# where you replace $HELLBENDER with the name of your hellbender folder
36# (typically just "hellbender")
37
38#
39# Generates the input to be tested.
40# It takes two arguments: the first and last element of the
41# interval.
42# The output will be saved to $CLOUDINPUT
43#
44genInput () {
45  INPUT=$WORKDIR/CEUTrio.test.ch20.$1-$2.bam
46  STDINPUT=$WORKDIR/search_input.bam
47  rm $STDINPUT
48  # keep a copy of every input that we generated
49  ./hb PrintReads -I src/test/resources/org/broadinstitute/hellbender/tools/BQSR/CEUTrio.HiSeq.WGS.b37.ch20.1m-1m1k.NA12878.bam -O $INPUT -L 20:$1-$2 2>&1 > $WORKDIR/inputgen.log || echo "input gen reports an error"
50  # Standard filename for the next stage
51  cp $INPUT $STDINPUT
52  samtools index $STDINPUT
53  # copy to $CLOUDINPUT so the next phases can get to it
54  gsutil cp $STDINPUT $CLOUDINPUT
55  gsutil acl set public-read $CLOUDINPUT
56  gsutil cp $STDINPUT.bai $CLOUDINPUT.bai
57  gsutil acl set public-read $CLOUDINPUT.bai
58}
59
60#
61# Generates the "known good" output.
62#
63genCorrectOutput () {
64  STDINPUT=$WORKDIR/search_input.bam
65  # change this line to run the known good version of the computation you're interested in.
66  ./hb BaseRecalibrator -R ../../sample-data/GRCh37.ch20.fixup.fa -I $STDINPUT --knownSites tstBQSR/dbsnp_132.b37.excluding_sites_after_129.chr17_69k_70k.vcf -sortAllCols --RECAL_TABLE_FILE $GOODOUTPUT 2>&1 > $WORKDIR/outputgen.log  || echo "correct output gen reports an error"
67}
68
69#
70# Runs the computation under test.
71#
72genTestOutput () {
73  # change this line to run the computation you're testing.
74  ./hb BaseRecalibratorDataflow --runner=BLOCKING --numWorkers=1 -R gg://reference/EOSsjdnTicvzwAE -I $CLOUDINPUT --baseRecalibrationKnownVariants tstBQSR/dbsnp_132.b37.excluding_sites_after_129.chr17_69k_70k.vcf --project=genomics-pipelines --staging=gs://jpmartin/staging-jpmartin -sortAllCols --RECAL_TABLE_FILE $TESTOUTPUT 2>&1 > $WORKDIR/testgen.log || echo "test output gen reports an error"
75}
76
77# This tries to shrink a failing interval by removing less than half.
78# (It runs after the binary search, so we know that removing half is too much)
79rogner() {
80# try to move the left side by 1/5th at a time
81while true; do
82  MID=$(( $FIRST + ($LAST - $FIRST) / 5 ))
83  if [ $FIRST -eq $MID ]; then
84    echo "Done shrinking the left"
85    break
86  fi
87  rm -f $GOODOUTPUT $TESTOUTPUT
88  genInput $MID $LAST
89  genCorrectOutput
90  genTestOutput
91  if [ ! -f $GOODOUTPUT ]; then
92    echo "missing correct output for $MID-$LAST."
93    break
94  fi
95  if [ ! -f $TESTOUTPUT ]; then
96    echo "Output missing (test) for $MID-$LAST."
97    FIRST=$MID;
98    continue
99  fi
100  if ! `diff $WORKDIR/recaltable.correct $TESTOUTPUT`; then
101    # they differ, explore this branch
102    echo "Output is wrong for $MID-$LAST, going deeper"
103    FIRST=$MID;
104  else
105    # they are the same, try the other half
106    echo "Output matches for $MID-$LAST, going to the right side"
107    break
108  fi
109done
110# right side now, again 1/5th at a time
111while true; do
112  MID=$(( $LAST - ($LAST - $FIRST) / 5 ))
113  if [ $LAST -eq $MID ]; then
114    echo "Done shrinking the right"
115    break
116  fi
117  rm -f $GOODOUTPUT $TESTOUTPUT
118  genInput $FIRST $MID
119  genCorrectOutput
120  genTestOutput
121  if [ ! -f $GOODOUTPUT ]; then
122    echo "missing correct output for $FIRST-$MID."
123    break
124  fi
125  if [ ! -f $TESTOUTPUT ]; then
126    echo "Output missing (test) for $FIRST-$MID."
127    LAST=$MID;
128    continue
129  fi
130  if ! `diff $GOODOUTPUT $TESTOUTPUT`; then
131    # they differ, explore this branch
132    echo "Output is wrong for $FIRST-$MID, going deeper"
133    LAST=$MID;
134  else
135    # they are the same, try the other half
136    echo "Output matches for $FIRST-$MID, done"
137    break
138  fi
139done
140
141echo "Output is now $FIRST-$LAST"
142}
143
144
145
146#
147# This runs a binary search to look for the smallest interval there the output
148# of the test program differs from the output of the good program.
149#
150binarySearch () {
151while true; do
152
153  MID=$(( ( $FIRST + $LAST ) / 2 ))
154  if [ $FIRST -eq $MID ]; then
155    echo "We have narrowed it down to just $FIRST-$LAST, we're done now."
156    break
157  fi
158  rm -f $GOODOUTPUT $TESTOUTPUT
159  genInput $FIRST $MID
160  genCorrectOutput
161  genTestOutput
162  if [ ! -f $GOODOUTPUT ]; then
163    echo "missing correct output for $FIRST-$MID."
164    break
165  fi
166  if [ ! -f $TESTOUTPUT ]; then
167    echo "missing test output for $FIRST-$MID."
168  fi
169  if ! `diff $GOODOUTPUT $TESTOUTPUT`; then
170    # they differ, explore this branch
171    echo "Output is wrong for $FIRST-$MID, going deeper"
172    LAST=$MID;
173  else
174    # they are the same, try the other half
175    echo "Output is correct for $FIRST-$MID, trying the latter half"
176    rm -f $GOODOUTPUT $TESTOUTPUT
177    genInput $MID $LAST
178    genCorrectOutput
179    genTestOutput
180    if [ ! -f $GOODOUTPUT ]; then
181      echo "missing correct output for $MID-$LAST."
182      break
183    fi
184    if [ ! -f $TESTOUTPUT ]; then
185      echo "missing test output for $MID-$LAST."
186    fi
187    if ! `diff $GOODOUTPUT $TESTOUTPUT`; then
188      # they differ, explore this branch
189      echo "Output is wrong for $MID-$LAST, going deeper"
190      FIRST=$MID;
191    else
192      echo "Output is correct for $MID-$LAST too!"
193      # they are the same, too!
194      break
195    fi
196  fi
197
198done
199}
200
201
202# create the work directory if it's not already there
203[ -e $WORKDIR ] || mkdir -p $WORKDIR
204
205# shrink the range, halving every time
206binarySearch
207# if there is still something left, try removing 1/5th at a time
208rogner
209