1# -*-Perl-*- Test Harness script for Bioperl 2# $Id$ 3 4use strict; 5 6BEGIN { 7 use Bio::Root::Test; 8 9 test_begin( -tests => 149 ); 10 11 use_ok('Bio::SeqIO::fastq'); 12 use_ok('Bio::Seq::Quality'); 13} 14 15my $DEBUG = test_debug(); 16 17# simple parsing, data conversion of fastq example files 18 19my %example_files = ( 20 bug2335 => { 21 'variant' => 'sanger', 22 'seq' => 'TTGGAATGTTGCAAATGGGAGGCAGTTTGAAATACTGAATAGGCCTCATC'. 23 'GAGAATGTGAAGTTTCAGTAAAGACTTGAGGAAGTTGAATGAGCTGATGA'. 24 'ATGGATATATG', 25 'qual' => '31 23 32 23 31 22 27 28 32 24 25 23 30 25 2 21 33 '. 26 '29 9 17 33 27 27 27 25 33 29 9 28 32 27 7 27 21 '. 27 '26 21 27 27 17 26 23 31 23 32 24 27 27 28 27 28 '. 28 '28 27 27 31 23 23 28 27 27 32 23 27 35 30 12 28 '. 29 '27 27 25 33 29 10 27 28 28 33 25 27 27 31 23 34 '. 30 '27 27 32 24 27 30 22 24 28 24 27 28 27 26 28 27 '. 31 '28 32 24 28 33 25 23 27 27 28 27 28 26', 32 'display_id' => 'DS6BPQV01A2G0A', 33 'desc' => undef, 34 'count' => 1 35 }, 36 RT98876 => { 37 'variant' => 'sanger', 38 'seq' => 'CCGCCATTTCTTCAAATCTTTTCTTTTCTTTAGGAGTCATCAATTTCCAT'. 39 'TTCTCTGCACATTTCTTTGAAAATTA', 40 'qual' => '31 34 34 34 34 34 34 34 34 33 34 34 34 34 34 34 '. 41 '34 34 34 34 34 32 32 34 34 34 34 34 34 34 34 34 '. 42 '30 34 34 34 34 34 34 34 34 34 34 34 34 34 32 32 '. 43 '34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 '. 44 '34 34 34 34 33 30 30 27 33 34 29 2', 45 'display_id' => 'Illumina_SRR125365.38', 46 'desc' => 's_5_1_0001_qseq_37 length=76', 47 'count' => 1 48 }, 49 test1_sanger => { 50 'variant' => 'sanger', 51 'seq' => 'TATTGACAATTGTAAGACCACTAAGGATTTTTGGGCGGCAGCGACTTGGA'. 52 'GCTCTTGTAAAAGCGCACTGCGTTCCTTTTCTTTATTCTTTTGATCTTGA'. 53 'GAATCTTCTAAAAATGCCGAAAAGAAATGTTGGGAAGAGAGCGTAATCAG'. 54 'TTTAGAAATGCTCTTGATGGTAGCTTTATGTTGATCCATTCTTCTGCCTC'. 55 'CTTTACGAATAAAATAGAATAAAACTCAAATGACTAATTACCTGTATTTT'. 56 'ACCTAATTTTGTGATAAAATTCAAGAAAATATGTTCGCCTTCAATAATTA'. 57 'TG', 58 'qual' => '37 37 37 37 37 37 37 37 37 37 37 40 38 40 40 37 '. 59 '37 37 39 39 40 39 39 39 39 39 37 33 33 33 33 33 '. 60 '39 39 34 29 28 28 38 39 39 39 39 39 39 37 37 37 '. 61 '37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '. 62 '37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 38 '. 63 '38 29 29 29 34 38 37 37 33 33 33 33 37 37 37 37 '. 64 '37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '. 65 '37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '. 66 '37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '. 67 '37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '. 68 '37 37 37 37 37 37 37 37 37 37 37 37 37 37 38 38 '. 69 '38 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '. 70 '37 37 37 37 37 37 37 37 37 34 34 34 38 38 37 37 '. 71 '37 37 37 37 37 37 37 37 40 40 40 40 38 38 38 38 '. 72 '40 40 40 38 38 38 40 40 40 40 40 40 40 40 40 40 '. 73 '38 38 38 38 38 32 25 25 25 25 30 30 31 32 32 31 '. 74 '31 31 31 31 31 31 31 31 19 19 19 19 19 22 22 31 '. 75 '31 31 31 31 31 31 31 32 32 31 32 31 31 31 31 31 '. 76 '31 25 25 25 28 28 30 30 30 30 30 31 31 32', 77 'display_id' => 'SRR005406.250', 78 'desc' => 'FB9GE3J10F6I2T length=302', 79 'count' => 250 80 }, 81 test2_solexa => { 82 'variant' => 'solexa', 83 'seq' => 'GTATTATTTAATGGCATACACTCAA', 84 'qual' => '25 25 25 25 25 25 25 25 25 25 23 25 25 25 25 23 '. 85 '25 23 23 21 23 23 23 17 17', 86 'display_id' => 'SLXA-B3_649_FC8437_R1_1_1_183_714', 87 'desc' => undef, 88 'count' => 5 89 }, 90 test3_illumina => { 91 'variant' => 'illumina', 92 'seq' => 'CCAAATCTTGAATTGTAGCTCCCCT', 93 'qual' => '15 19 24 15 17 24 24 24 24 24 19 24 24 21 24 24 '. 94 '20 24 24 24 24 20 18 13 19', 95 'display_id' => 'FC12044_91407_8_200_285_136', 96 'desc' => undef, 97 'count' => 25 98 }, 99 example => { 100 'variant' => 'sanger', # TODO: guessing on the format here... 101 'seq' => 'GTTGCTTCTGGCGTGGGTGGGGGGG', 102 'qual' => '26 26 26 26 26 26 26 26 26 26 26 24 26 22 26 26 '. 103 '13 22 26 18 24 18 18 18 18', 104 'display_id' => 'EAS54_6_R1_2_1_443_348', 105 'desc' => undef, 106 'count' => 3 107 }, 108 illumina_faked => { 109 'variant' => 'illumina', 110 'seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN', 111 'qual' => '40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 '. 112 '21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0', 113 'display_id' => 'Test', 114 'desc' => 'PHRED qualities from 40 to 0 inclusive', 115 'count' => 1 116 }, 117 sanger_93 => { 118 'variant' => 'sanger', 119 'seq' => 'ACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGAC'. 120 'TGACTGACTGACTGACTGACTGACTGACTGACTGAN', 121 'qual' => '93 92 91 90 89 88 87 86 85 84 83 82 81 80 79 78 77 76 75 '. 122 '74 73 72 71 70 69 68 67 66 65 64 63 62 61 60 59 58 57 56 '. 123 '55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 '. 124 '36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 '. 125 '17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0', 126 'display_id' => 'Test', 127 'desc' => 'PHRED qualities from 93 to 0 inclusive', 128 'count' => 1 129 }, 130 sanger_faked => { 131 'variant' => 'sanger', 132 'seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN', 133 'qual' => '40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 '. 134 '21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0', 135 'display_id' => 'Test', 136 'desc' => 'PHRED qualities from 40 to 0 inclusive', 137 'count' => 1 138 }, 139 solexa_example => { 140 'variant' => 'solexa', 141 'seq' => 'GTATTATTTAATGGCATACACTCAA', 142 'qual' => '25 25 25 25 25 25 25 25 25 25 23 25 25 25 25 23 '. 143 '25 23 23 21 23 23 23 17 17', 144 'display_id' => 'SLXA-B3_649_FC8437_R1_1_1_183_714', 145 'desc' => undef, 146 'count' => 5 147 }, 148 solexa_faked => { 149 'variant' => 'solexa', 150 'seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN', 151 'qual' => '40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 '. 152 '24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 10 9 '. 153 '8 7 6 5 5 4 4 3 3 2 2 1 1', 154 'display_id' => 'slxa_0001_1_0001_01', 155 'desc' => undef, 156 'count' => 1 157 }, 158 tricky => { 159 'variant' => 'sanger', # TODO: guessing on the format here... 160 'seq' => 'TGGGAGGTTTTATGTGGAAAGCAGCAATGTACAAGA', 161 'qual' => '40 40 40 40 40 40 40 13 40 40 40 40 40 40 16 31 '. 162 '19 19 31 12 22 13 4 27 5 10 14 3 14 4 19 7 10 10 '. 163 '7 4', 164 'display_id' => '071113_EAS56_0053:1:3:990:501', 165 'desc' => undef, 166 'count' => 4 167 }, 168 evil_wrapping => { 169 'variant' => 'sanger', # TODO: guessing on the format here... 170 'seq' => 'AACCCGTCCCATCAAAGATTTTGGTTGGAACCCGAAAGGGTTTTGAATTC'. 171 'AAACCCCTTTCGGTTCCAACTATTCAATTGTTTAACTTTTTTTAAATTGA'. 172 'TGGTCTGTTGGACCATTTGTAATAATCCCCATCGGAATTTCTTT', 173 'qual' => '32 26 31 26 4 22 20 30 25 2 27 27 24 36 32 16 '. 174 '26 28 36 32 18 4 33 26 33 26 32 26 33 26 31 26 '. 175 '4 24 36 32 16 36 32 16 36 32 18 4 27 33 26 32 26 '. 176 '23 36 32 15 35 31 18 3 36 32 16 28 33 26 32 26 33 '. 177 '26 33 26 25 28 25 33 26 25 33 25 32 24 25 36 32 '. 178 '15 32 24 27 37 32 23 16 10 5 1 35 30 12 33 26 19 '. 179 '27 25 25 14 27 26 28 25 32 24 23 12 20 30 21 28 '. 180 '34 29 10 23 27 27 18 26 28 19 25 35 32 18 4 27 26 '. 181 '28 23 12 24 13 32 28 8 25 33 28 9', 182 'display_id' => 'SRR014849.203935', 183 'desc' => 'EIXKN4201B4HU6 length=144', 184 'count' => 3 185 }, 186); 187 188for my $example (sort keys %example_files) { 189 my $file = test_input_file('fastq', "$example.fastq"); 190 my $variant = $example_files{$example}->{variant}; 191 my $in = Bio::SeqIO->new(-format => "fastq-$variant", 192 -file => $file, 193 -verbose => 2); #strictest level 194 my $ct = 0; 195 my $sample_seq; 196 eval { 197 while (my $seq = $in->next_seq) { 198 $ct++; 199 $sample_seq = $seq; # always grab the last seq 200 } 201 }; 202 ok(!$@, "$example parses"); 203 is($ct, $example_files{$example}->{count}, "correct num. seqs in $example"); 204 ok(defined($sample_seq), 'sample sequence obtained'); 205 if ($sample_seq) { 206 isa_ok($sample_seq, 'Bio::Seq::Quality'); 207 for my $method (qw(seq desc display_id)) { 208 is($sample_seq->$method, 209 $example_files{$example}->{$method}, 210 "$method() matches $example"); 211 } 212 is(join(' ', map {sprintf("%.0f", $_)} @{$sample_seq->qual}), 213 $example_files{$example}->{qual}, 214 "qual() matches $example"); 215 my $truncated = $sample_seq->trunc(1,10); 216 is(scalar(@{$truncated->meta}), $truncated->length); 217 } 218} 219 220# test round-trip and conversions (single file of each type) 221 222my @variants = qw(sanger illumina solexa); 223 224my %conversion = ( # check conversions, particularly solexa 225 sanger_93 => { 226 'variant' => 'sanger', 227 'to_solexa' => { 228 '-seq' => 'ACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGAN', 229 '-qual' => [ (map {62} 0..31),(reverse(1..61)),1 ], 230 '-raw_quality' => '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJHGFECB@>;;', 231 '-id' => 'Test', 232 '-desc' => 'PHRED qualities from 93 to 0 inclusive', 233 '-descriptor' => 'Test PHRED qualities from 93 to 0 inclusive' 234 }, 235 'to_illumina' => { 236 '-seq' => 'ACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGAN', 237 '-qual' => [ (map {62} 0..31),(reverse(0..61)) ], 238 '-raw_quality' => '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@', 239 '-id' => 'Test', 240 '-desc' => 'PHRED qualities from 93 to 0 inclusive', 241 '-descriptor' => 'Test PHRED qualities from 93 to 0 inclusive' 242 }, 243 'to_sanger' => { 244 '-seq' => 'ACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGAN', 245 '-qual' => [reverse(0..93)], 246 '-raw_quality' => '~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:9876543210/.-,+*)(\'&%$#"!', 247 '-id' => 'Test', 248 '-desc' => 'PHRED qualities from 93 to 0 inclusive', 249 '-descriptor' => 'Test PHRED qualities from 93 to 0 inclusive' 250 }, 251 }, 252 solexa_faked => { 253 'variant' => 'solexa', 254 'to_solexa' => {'-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN', 255 '-qual' => [qw(40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 10 9 8 7 6 5 5 4 4 3 3 2 2 1 1)], 256 '-raw_quality' => 'hgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;', 257 '-id' => 'slxa_0001_1_0001_01', 258 '-desc' => '', 259 '-descriptor' => 'slxa_0001_1_0001_01' 260 }, 261 'to_illumina' => { 262 '-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN', 263 '-qual' => [qw(40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 10 9 8 7 6 5 5 4 4 3 3 2 2 1 1)], 264 '-namespace' => 'solexa', 265 '-raw_quality' => 'hgfedcba`_^]\\[ZYXWVUTSRQPONMLKJJIHGFEEDDCCBBAA', 266 '-id' => 'slxa_0001_1_0001_01', 267 '-desc' => '', 268 '-descriptor' => 'slxa_0001_1_0001_01' 269 }, 270 'to_sanger' => { 271 '-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN', 272 '-qual' => [qw(40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 10 9 8 7 6 5 5 4 4 3 3 2 2 1 1)], 273 '-namespace' => 'solexa', 274 '-raw_quality' => 'IHGFEDCBA@?>=<;:9876543210/.-,++*)(\'&&%%$$##""', 275 '-id' => 'slxa_0001_1_0001_01', 276 '-desc' => '', 277 '-descriptor' => 'slxa_0001_1_0001_01' 278 }, 279 }, 280 illumina_faked => { 281 'variant' => 'illumina', 282 'to_solexa' => { 283 '-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN', 284 '-qual' => [reverse(1..40), 1], # round trip from solexa is lossy 285 '-namespace' => 'illumina', 286 '-raw_quality' => 'hgfedcba`_^]\[ZYXWVUTSRQPONMLKJHGFECB@>;;', 287 '-id' => 'Test', 288 '-desc' => 'PHRED qualities from 40 to 0 inclusive', 289 '-descriptor' => 'Test PHRED qualities from 40 to 0 inclusive' 290 }, 291 'to_illumina' => { 292 '-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN', 293 '-qual' => [reverse(0..40)], 294 '-raw_quality' => 'hgfedcba`_^]\\[ZYXWVUTSRQPONMLKJIHGFEDCBA@', 295 '-id' => 'Test', 296 '-desc' => 'PHRED qualities from 40 to 0 inclusive', 297 '-descriptor' => 'Test PHRED qualities from 40 to 0 inclusive' 298 }, 299 'to_sanger' => { 300 '-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN', 301 '-qual' => [reverse(0..40)], 302 '-raw_quality' => 'IHGFEDCBA@?>=<;:9876543210/.-,+*)(\'&%$#"!', 303 '-id' => 'Test', 304 '-desc' => 'PHRED qualities from 40 to 0 inclusive', 305 '-descriptor' => 'Test PHRED qualities from 40 to 0 inclusive' 306 } 307 }, 308); 309 310for my $example (sort keys %conversion) { 311 my $file = test_input_file('fastq', "$example.fastq"); 312 my $variant = $conversion{$example}->{variant}; 313 my $in = Bio::SeqIO->new(-format => "fastq-$variant", 314 -file => $file, 315 -verbose => 2); #strictest level 316 # this both tests the next_dataset method and helps check roundtripping 317 my $seq = $in->next_seq; 318 for my $newvar (@variants) { 319 next unless exists $conversion{$example}->{"to_$newvar"}; 320 my $outfile = test_output_file(); 321 Bio::SeqIO->new(-format => "fastq-$newvar", 322 -file => ">$outfile", 323 -verbose => -1)->write_seq($seq); 324 my $newdata = Bio::SeqIO->new(-format => "fastq-$newvar", 325 -file => $outfile)->next_dataset; 326 # round for simple comparison, get around floating pt comparison probs 327 328 if ($newvar eq 'solexa') { 329 $newdata->{-qual} = [map {sprintf("%.0f",$_)} @{$newdata->{-qual}}]; 330 } 331 332 #print Dumper($newdata) if $variant eq 'sanger' && $newvar eq 'illumina'; 333 334 $conversion{$example}->{"to_$newvar"}->{'-namespace'} = $newvar; 335 is_deeply($newdata, $conversion{$example}->{"to_$newvar"}, "Conversion from $variant to $newvar"); 336 } 337} 338 339# test fastq exception handling 340 341my %error = ( 342 # file name 343 error_diff_ids => { 344 variant => 'sanger', 345 exception => qr/doesn't\smatch\sseq\sdescriptor/xms, 346 }, 347 error_long_qual => { 348 variant => 'sanger', 349 exception => qr/doesn't\smatch\slength\sof\ssequence/xms, 350 }, 351 error_no_qual => { 352 variant => 'sanger', 353 exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms, 354 }, 355 error_qual_del => { 356 variant => 'sanger', 357 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms, 358 }, 359 error_qual_escape => { 360 variant => 'sanger', 361 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms, 362 }, 363 error_qual_null => { 364 variant => 'sanger', 365 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms, 366 }, 367 error_qual_space => { 368 variant => 'sanger', 369 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms, 370 }, 371 error_qual_tab => { 372 variant => 'sanger', 373 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms, 374 }, 375 error_qual_unit_sep => { 376 variant => 'sanger', 377 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms, 378 }, 379 error_qual_vtab => { 380 variant => 'sanger', 381 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms, 382 }, 383 error_short_qual => { 384 variant => 'sanger', 385 exception => qr/doesn't\smatch\slength\sof\ssequence/, 386 }, 387 error_spaces => { 388 variant => 'sanger', 389 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms, 390 }, 391 error_tabs => { 392 variant => 'sanger', 393 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms, 394 }, 395 error_trunc_at_plus => { 396 variant => 'sanger', 397 exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms, 398 }, 399 error_trunc_at_qual => { 400 variant => 'sanger', 401 exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms, 402 }, 403 error_trunc_at_seq => { 404 variant => 'sanger', 405 exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms, 406 }, 407 error_trunc_in_title => { 408 variant => 'sanger', 409 exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms, 410 }, 411 error_trunc_in_seq => { 412 variant => 'sanger', 413 exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms, 414 }, 415 error_trunc_in_plus => { 416 variant => 'sanger', 417 exception => qr/doesn't\smatch\sseq\s descriptor/xms, 418 }, 419 error_trunc_in_qual => { 420 variant => 'sanger', 421 exception => qr/doesn't\smatch\slength\sof\ssequence/xms, 422 }, 423); 424 425for my $example (sort keys %error) { 426 my $file = test_input_file('fastq', "$example.fastq"); 427 my $variant = $error{$example}->{variant}; 428 my $in = Bio::SeqIO->new(-format => "fastq-$variant", 429 -file => $file, 430 -verbose => 2); #strictest level 431 my $ct = 0; 432 throws_ok { while (my $seq = $in->next_seq) { 433 $ct++; 434 } } $error{$example}->{exception}, "Exception caught for $example"; 435} 436 437# fastq 438 439my $in = Bio::SeqIO->new(-format => 'fastq', 440 -file => test_input_file('fastq', 'zero_qual.fastq'), 441 -verbose => 2); # strictest level 442 443lives_and {my $seq = $in->next_seq; 444 is($seq->seq, 'G');} 'edge case; single 0 in quality fails'; 445 446 447 448