1# -*-Perl-*- Test Harness script for Bioperl 2# $Id: SearchIO_blast.t 14995 2008-11-16 06:20:00Z cjfields $ 3 4use strict; 5use warnings; 6 7BEGIN { 8 use Bio::Root::Test; 9 10 test_begin(-tests => 1389); 11 12 use_ok('Bio::SearchIO'); 13} 14 15SKIP: { 16 test_skip(-tests => 4, -requires_module => 'Path::Class'); 17 18 19 my $file = Path::Class::file(test_input_file('ecolitst.bls')); 20 my $f = sub { my ($file) = @_; Bio::SearchIO->new( -file => $file, -format => 'blast') }; 21 22 lives_ok(sub { $f->($file) } , 'Bio::SearchIO->new can handle a Path::Class object'); 23 isa_ok($f->($file), 'Bio::Root::IO'); 24 25 $file = Path::Class::dir(File::Spec->catfile(qw/t data/))->file('ecolitst.bls'); 26 27 lives_ok(sub { $f->($file) } , 'Bio::SearchIO->new can handle a Path::Class object'); 28 isa_ok($f->($file), 'Bio::Root::IO'); 29} 30 31my ( $searchio, $result, $iter, $hit, $hsp ); 32 33$searchio = Bio::SearchIO->new( 34 '-format' => 'blast', 35 '-file' => test_input_file('ecolitst.bls') 36); 37 38$result = $searchio->next_result; 39 40like($result->algorithm_reference, 41 qr/Gapped BLAST and PSI-BLAST: a new generation of protein database search/ 42); 43 44is( $result->database_name, 'ecoli.aa', 'database_name()' ); 45is( $result->database_entries, 4289 ); 46is( $result->database_letters, 1358990 ); 47 48is( $result->algorithm, 'BLASTP' ); 49like( $result->algorithm_version, qr/^2\.1\.3/ ); 50like( $result->query_name, 51qr/gi|1786183|gb|AAC73113.1| (AE000111) aspartokinase I,\s+homoserine dehydrogenase I [Escherichia coli]/ 52); 53is( $result->query_accession, 'AAC73113.1' ); 54is( $result->query_gi, 1786183 ); 55is( $result->query_length, 820 ); 56is( $result->get_statistic('kappa'), '0.135' ); 57is( $result->get_statistic('kappa_gapped'), '0.0410' ); 58is( $result->get_statistic('lambda'), '0.319' ); 59is( $result->get_statistic('lambda_gapped'), '0.267' ); 60is( $result->get_statistic('entropy'), '0.383' ); 61is( $result->get_statistic('entropy_gapped'), '0.140' ); 62 63is( $result->get_statistic('dbletters'), 1358990 ); 64is( $result->get_statistic('dbentries'), 4289 ); 65is( $result->get_statistic('effective_hsplength'), 47 ); 66is( $result->get_statistic('effectivespace'), 894675611 ); 67is( $result->get_parameter('matrix'), 'BLOSUM62' ); 68is( $result->get_parameter('gapopen'), 11 ); 69is( $result->get_parameter('gapext'), 1 ); 70is( $result->get_statistic('S2'), '92' ); 71is( $result->get_statistic('S2_bits'), '40.0' ); 72float_is( $result->get_parameter('expect'), '1.0e-03' ); 73is( $result->get_statistic('num_extensions'), '82424' ); 74is( $result->get_statistic('querylength'), 773 ); 75is( $result->get_statistic('effectivedblength'), 1157407 ); 76is( $result->get_statistic('effectivespaceused'), 894675611 ); 77 78my @valid = ( 79 [ 'gb|AAC73113.1|', 820, 'AAC73113', '0', 1567, 4058 ], 80 [ 'gb|AAC76922.1|', 810, 'AAC76922', '1e-91', 332, 850 ], 81 [ 'gb|AAC76994.1|', 449, 'AAC76994', '3e-47', 184, 467 ] 82); 83my $count = 0; 84while ( $hit = $result->next_hit ) { 85 my $d = shift @valid; 86 87 is( $hit->name, shift @$d ); 88 is( $hit->length, shift @$d ); 89 is( $hit->accession, shift @$d ); 90 float_is( $hit->significance, shift @$d ); 91 is( $hit->bits, shift @$d ); 92 is( $hit->raw_score, shift @$d ); 93 94 if ( $count == 0 ) { 95 my $hsps_left = 1; 96 while ( my $hsp = $hit->next_hsp ) { 97 is( $hsp->query->start, 1 ); 98 is( $hsp->query->end, 820 ); 99 is( $hsp->hit->start, 1 ); 100 is( $hsp->hit->end, 820 ); 101 is( $hsp->length('total'), 820 ); 102 is( $hsp->start('hit'), $hsp->hit->start ); 103 is( $hsp->end('query'), $hsp->query->end ); 104 is( $hsp->strand('sbjct'), $hsp->subject->strand ); # alias for hit 105 float_is( $hsp->evalue, 0.0 ); 106 is( $hsp->score, 4058 ); 107 is( $hsp->bits, 1567 ); 108 is( sprintf( "%.2f", $hsp->percent_identity ), 98.29 ); 109 is( sprintf( "%.4f", $hsp->frac_identical('query') ), 0.9829 ); 110 is( sprintf( "%.4f", $hsp->frac_identical('hit') ), 0.9829 ); 111 is( $hsp->gaps, 0 ); 112 is( $hsp->n, 1 ); 113 $hsps_left--; 114 } 115 is( $hsps_left, 0 ); 116 } 117 last if ( $count++ > @valid ); 118} 119is( @valid, 0 ); 120 121$searchio = Bio::SearchIO->new( 122 '-format' => 'blast', 123 '-file' => test_input_file('ecolitst.wublastp') 124); 125 126$result = $searchio->next_result; 127 128like($result->algorithm_reference, 129 qr/Gish, W. \(1996-2000\)/); 130 131is( $result->database_name, 'ecoli.aa' ); 132is( $result->database_letters, 1358990 ); 133is( $result->database_entries, 4289 ); 134is( $result->algorithm, 'BLASTP' ); 135like( $result->algorithm_version, qr/^2\.0MP\-WashU/ ); 136like( $result->query_name, 137qr/gi|1786183|gb|AAC73113.1| (AE000111) aspartokinase I,\s+homoserine dehydrogenase I [Escherichia coli]/ 138); 139is( $result->query_accession, 'AAC73113.1' ); 140 141is( $result->query_length, 820 ); 142is( $result->query_gi, 1786183 ); 143is( $result->get_statistic('kappa'), 0.136 ); 144is( $result->get_statistic('lambda'), 0.319 ); 145is( $result->get_statistic('entropy'), 0.384 ); 146is( $result->get_statistic('dbletters'), 1358990 ); 147is( $result->get_statistic('dbentries'), 4289 ); 148is( $result->get_parameter('matrix'), 'BLOSUM62' ); 149is( $result->get_statistic('Frame+0_lambda_used'), '0.319' ); 150is( $result->get_statistic('Frame+0_kappa_used'), '0.136' ); 151is( $result->get_statistic('Frame+0_entropy_used'), '0.384' ); 152 153is( $result->get_statistic('Frame+0_lambda_computed'), '0.319' ); 154is( $result->get_statistic('Frame+0_kappa_computed'), '0.136' ); 155is( $result->get_statistic('Frame+0_entropy_computed'), '0.384' ); 156 157is( $result->get_statistic('Frame+0_lambda_gapped'), '0.244' ); 158is( $result->get_statistic('Frame+0_kappa_gapped'), '0.0300' ); 159is( $result->get_statistic('Frame+0_entropy_gapped'), '0.180' ); 160 161@valid = ( 162 [ 'gb|AAC73113.1|', 820, 'AAC73113', '0', 4141 ], 163 [ 'gb|AAC76922.1|', 810, 'AAC76922', '3.1e-86', 844 ], 164 [ 'gb|AAC76994.1|', 449, 'AAC76994', '2.8e-47', 483 ] 165); 166$count = 0; 167while ( $hit = $result->next_hit ) { 168 my $d = shift @valid; 169 170 if ( $count == 1 ) { 171 172 # Test HSP contig data returned by SearchUtils::tile_hsps() 173 # Second hit has two hsps that overlap. 174 175 # compare with the contig made by hand for these two contigs 176 # in t/data/contig-by-hand.wublastp 177 # (in this made-up file, the hsps from ecolitst.wublastp 178 # were aligned and contiged, and Length, Identities, Positives 179 # were counted, by a human (maj) ) 180 181 my $hand_hit = Bio::SearchIO->new( 182 -format => 'blast', 183 -file => test_input_file('contig-by-hand.wublastp') 184 )->next_result->next_hit; 185 my $hand_hsp = $hand_hit->next_hsp; 186 my @hand_qrng = $hand_hsp->range('query'); 187 my @hand_srng = $hand_hsp->range('hit'); 188 my @hand_matches = $hand_hit->matches; 189 190 my ( $qcontigs, $scontigs ) = Bio::Search::SearchUtils::tile_hsps($hit); 191 192 # Query contigs 193 is( $qcontigs->[0]->{'start'}, $hand_qrng[0] ); 194 is( $qcontigs->[0]->{'stop'}, $hand_qrng[1] ); 195 is( $qcontigs->[0]->{'iden'}, $hand_matches[0] ); 196 is( $qcontigs->[0]->{'cons'}, $hand_matches[1] ); 197 198 # Subject contigs 199 is( $scontigs->[0]->{'start'}, $hand_srng[0] ); 200 is( $scontigs->[0]->{'stop'}, $hand_srng[1] ); 201 is( $scontigs->[0]->{'iden'}, $hand_matches[0] ); 202 is( $scontigs->[0]->{'cons'}, $hand_matches[1] ); 203 } 204 205 is( $hit->name, shift @$d ); 206 is( $hit->length, shift @$d ); 207 is( $hit->accession, shift @$d ); 208 float_is( $hit->significance, shift @$d ); 209 is( $hit->raw_score, shift @$d ); 210 211 if ( $count == 0 ) { 212 my $hsps_left = 1; 213 while ( my $hsp = $hit->next_hsp ) { 214 is( $hsp->query->start, 1 ); 215 is( $hsp->query->end, 820 ); 216 is( $hsp->hit->start, 1 ); 217 is( $hsp->hit->end, 820 ); 218 is( $hsp->length('total'), 820 ); 219 220 float_is( $hsp->evalue, 0.0 ); 221 float_is( $hsp->pvalue, '0.0' ); 222 is( $hsp->score, 4141 ); 223 is( $hsp->bits, 1462.8 ); 224 is( $hsp->percent_identity, 100 ); 225 is( $hsp->frac_identical('query'), 1.00 ); 226 is( $hsp->frac_identical('hit'), 1.00 ); 227 is( $hsp->gaps, 0 ); 228 is( $hsp->n, 1 ); 229 $hsps_left--; 230 } 231 is( $hsps_left, 0 ); 232 } 233 last if ( $count++ > @valid ); 234} 235is( @valid, 0 ); 236 237# test that add hit really works properly for BLAST objects 238# bug 1611 239my @hits = $result->hits; 240$result->add_hit( $hits[0] ); 241is( $result->num_hits, @hits + 1 ); 242 243# test WU-BLAST -noseqs option 244$searchio = Bio::SearchIO->new( 245 '-format' => 'blast', 246 '-file' => test_input_file('ecolitst.noseqs.wublastp') 247); 248 249$result = $searchio->next_result; 250is( 251 $result->algorithm_reference, 'Gish, W. (1996-2004) http://blast.wustl.edu 252' 253); 254is( $result->database_name, 'ecoli.aa' ); 255is( $result->database_letters, 1358990 ); 256is( $result->database_entries, 4289 ); 257is( $result->algorithm, 'BLASTP' ); 258like( $result->algorithm_version, qr/^2\.0MP\-WashU/ ); 259like( $result->query_name, 260qr/gi|1786183|gb|AAC73113.1| (AE000111) aspartokinase I,\s+homoserine dehydrogenase I [Escherichia coli]/ 261); 262is( $result->query_accession, 'AAC73113.1' ); 263is( $result->query_gi, 1786183 ); 264 265is( $result->query_length, 820 ); 266is( $result->get_statistic('kappa'), 0.135 ); 267is( $result->get_statistic('lambda'), 0.319 ); 268is( $result->get_statistic('entropy'), 0.384 ); 269is( $result->get_statistic('dbletters'), 1358990 ); 270is( $result->get_statistic('dbentries'), 4289 ); 271is( $result->get_parameter('matrix'), 'BLOSUM62' ); 272is( $result->get_statistic('Frame+0_lambda_used'), '0.319' ); 273is( $result->get_statistic('Frame+0_kappa_used'), '0.135' ); 274is( $result->get_statistic('Frame+0_entropy_used'), '0.384' ); 275 276is( $result->get_statistic('Frame+0_lambda_computed'), '0.319' ); 277is( $result->get_statistic('Frame+0_kappa_computed'), '0.135' ); 278is( $result->get_statistic('Frame+0_entropy_computed'), '0.384' ); 279 280is( $result->get_statistic('Frame+0_lambda_gapped'), '0.244' ); 281is( $result->get_statistic('Frame+0_kappa_gapped'), '0.0300' ); 282is( $result->get_statistic('Frame+0_entropy_gapped'), '0.180' ); 283 284@valid = ( 285 [ 'gb|AAC73113.1|', 820, 'AAC73113', '0', 4141 ], 286 [ 'gb|AAC76922.1|', 810, 'AAC76922', '6.6e-93', 907 ], 287 [ 'gb|AAC76994.1|', 449, 'AAC76994', '2.8e-47', 483 ] 288); 289$count = 0; 290while ( $hit = $result->next_hit ) { 291 my $d = shift @valid; 292 293 is( $hit->name, shift @$d ); 294 is( $hit->length, shift @$d ); 295 is( $hit->accession, shift @$d ); 296 float_is( $hit->significance, shift @$d ); 297 is( $hit->raw_score, shift @$d ); 298 299 if ( $count == 0 ) { 300 my $hsps_left = 1; 301 while ( my $hsp = $hit->next_hsp ) { 302 is( $hsp->query->start, 1 ); 303 is( $hsp->query->end, 820 ); 304 is( $hsp->hit->start, 1 ); 305 is( $hsp->hit->end, 820 ); 306 is( $hsp->length('total'), 820 ); 307 308 float_is( $hsp->evalue, 0. ); 309 float_is( $hsp->pvalue, '0.' ); 310 is( $hsp->score, 4141 ); 311 is( $hsp->bits, 1462.8 ); 312 is( $hsp->percent_identity, 100 ); 313 is( $hsp->frac_identical('query'), 1.00 ); 314 is( $hsp->frac_identical('hit'), 1.00 ); 315 is( $hsp->gaps, 0 ); 316 is( $hsp->n, 1 ); 317 $hsps_left--; 318 } 319 is( $hsps_left, 0 ); 320 } 321 last if ( $count++ > @valid ); 322} 323is( @valid, 0 ); 324 325# test tblastx 326$searchio = Bio::SearchIO->new( 327 '-format' => 'blast', 328 '-file' => test_input_file('HUMBETGLOA.tblastx') 329); 330 331$result = $searchio->next_result; 332like($result->algorithm_reference,qr/Gapped BLAST and PSI-BLAST/); 333is( $result->database_name, 'ecoli.nt' ); 334is( $result->database_letters, 4662239 ); 335is( $result->database_entries, 400 ); 336is( $result->algorithm, 'TBLASTX' ); 337like( $result->algorithm_version, qr/^2\.1\.2/ ); 338is( $result->query_name, 'HUMBETGLOA' ); 339is( $result->query_description, 340 'Human haplotype C4 beta-globin gene, complete cds.' ); 341is( $result->query_length, 3002 ); 342is( $result->get_statistic('kappa'), 0.135 ); 343is( $result->get_statistic('lambda'), 0.318 ); 344is( $result->get_statistic('entropy'), 0.401 ); 345is( $result->get_statistic('dbletters'), 4662239 ); 346is( $result->get_statistic('dbentries'), 400 ); 347is( $result->get_statistic('querylength'), 953 ); 348is( $result->get_statistic('effectivedblength'), 1535279 ); 349is( $result->get_statistic('effectivespace'), 1463120887 ); 350is( $result->get_statistic('effectivespaceused'), 1463120887 ); 351is( $result->get_statistic('T'), 13 ); 352is( $result->get_statistic('X1'), 16 ); 353is( $result->get_statistic('X1_bits'), 7.3 ); 354is( $result->get_statistic('X2'), 0 ); 355is( $result->get_statistic('X2_bits'), '0.0' ); 356is( $result->get_statistic('S1'), 41 ); 357is( $result->get_statistic('S1_bits'), 21.7 ); 358is( $result->get_statistic('S2'), 53 ); 359is( $result->get_statistic('S2_bits'), 27.2 ); 360 361is( $result->get_statistic('decayconst'), 0.1 ); 362 363is( $result->get_parameter('matrix'), 'BLOSUM62' ); 364 365@valid = ( 366 [ 'gb|AE000479.1|AE000479', 10934, 'AE000479', '0.13', 33.6, 67 ], 367 [ 'gb|AE000302.1|AE000302', 10264, 'AE000302', '0.61', 31.3, 62 ], 368 [ 'gb|AE000277.1|AE000277', 11653, 'AE000277', '0.84', 30.8, 61 ] 369); 370$count = 0; 371 372while ( $hit = $result->next_hit ) { 373 my $d = shift @valid; 374 is( $hit->name, shift @$d ); 375 is( $hit->length, shift @$d ); 376 is( $hit->accession, shift @$d ); 377 float_is( $hit->significance, shift @$d ); 378 is( $hit->bits, shift @$d ); 379 is( $hit->raw_score, shift @$d ); 380 381 if ( $count == 0 ) { 382 my $hsps_left = 1; 383 while ( my $hsp = $hit->next_hsp ) { 384 is( $hsp->query->start, 1057 ); 385 is( $hsp->query->end, 1134 ); 386 is( $hsp->query->strand, 1 ); 387 is( $hsp->strand('query'), $hsp->query->strand ); 388 is( $hsp->hit->end, 5893 ); 389 is( $hsp->hit->start, 5816 ); 390 is( $hsp->hit->strand, -1 ); 391 is( $hsp->strand('sbjct'), $hsp->subject->strand ); 392 is( $hsp->length('total'), 26 ); 393 394 float_is( $hsp->evalue, 0.13 ); 395 is( $hsp->score, 67 ); 396 is( $hsp->bits, 33.6 ); 397 is( sprintf( "%.2f", $hsp->percent_identity ), 42.31 ); 398 is( sprintf( "%.4f", $hsp->frac_identical('query') ), '0.4231' ); 399 is( sprintf( "%.4f", $hsp->frac_identical('hit') ), '0.4231' ); 400 is( $hsp->query->frame(), 0 ); 401 is( $hsp->hit->frame(), 1 ); 402 is( $hsp->gaps, 0 ); 403 is( $hsp->query_string, 'SAYWSIFPPLGCWWSTLGPRGSLSPL' ); 404 is( $hsp->hit_string, 'AAVWALFPPVGSQWGCLASQWRTSPL' ); 405 is( $hsp->homology_string, '+A W++FPP+G W L + SPL' ); 406 407 # changed to reflect positional ambiguities, note extra flag 408 is( 409 join( ' ', $hsp->seq_inds( 'query', 'nomatch', 1 ) ), 410 '1063-1065 1090-1095 1099-1104 1108-1113 1117-1125' 411 ); 412 is( 413 join( ' ', $hsp->seq_inds( 'hit', 'nomatch', 1 ) ), 414 '5825-5833 5837-5842 5846-5851 5855-5860 5885-5887' 415 ); 416 is( $hsp->ambiguous_seq_inds, 'query/subject' ); 417 is( $hsp->n, 1 ); 418 $hsps_left--; 419 } 420 is( $hsps_left, 0 ); 421 } 422 last if ( $count++ > @valid ); 423} 424is( @valid, 0 ); 425 426# test for MarkW bug in blastN 427 428$searchio = Bio::SearchIO->new( 429 '-format' => 'blast', 430 '-file' => test_input_file('a_thaliana.blastn') 431); 432 433$result = $searchio->next_result; 434like($result->algorithm_reference,qr/Gapped BLAST and PSI-BLAST/); 435is( $result->rid, '1012577175-3730-28291' ); 436is( $result->database_name, 437'All GenBank+EMBL+DDBJ+PDB sequences (but no EST, STS, GSS,or phase 0, 1 or 2 HTGS sequences) ' 438); 439is( $result->database_letters, 4677375331 ); 440is( $result->database_entries, 1083200 ); 441is( $result->algorithm, 'BLASTN' ); 442like( $result->algorithm_version, qr/^2\.2\.1/ ); 443is( $result->query_name, '' ); 444is( $result->query_length, 60 ); 445is( $result->get_parameter('gapopen'), 5 ); 446is( $result->get_parameter('gapext'), 2 ); 447is( $result->get_parameter('ktup'), undef ); 448is( $result->get_statistic('querylength'), 41 ); 449is( $result->get_statistic('effectivedblength'), 4656794531 ); 450is( $result->get_statistic('effectivespace'), 190928575771 ); 451is( $result->get_statistic('effectivespaceused'), 190928575771 ); 452 453is( $result->get_statistic('lambda'), 1.37 ); 454is( $result->get_statistic('kappa'), 0.711 ); 455is( $result->get_statistic('entropy'), 1.31 ); 456is( $result->get_statistic('T'), 0 ); 457is( $result->get_statistic('A'), 30 ); 458is( $result->get_statistic('X1'), '6' ); 459is( $result->get_statistic('X1_bits'), 11.9 ); 460is( $result->get_statistic('X2'), 15 ); 461is( $result->get_statistic('X2_bits'), 29.7 ); 462is( $result->get_statistic('S1'), 12 ); 463is( $result->get_statistic('S1_bits'), 24.3 ); 464is( $result->get_statistic('S2'), 17 ); 465is( $result->get_statistic('S2_bits'), 34.2 ); 466 467is( $result->get_statistic('dbentries'), 1083200 ); 468 469@valid = ( 470 [ 'gb|AY052359.1|', 2826, 'AY052359', '3e-18', 95.6, 48, 1, 60, '1.0000' ], 471 [ 472 'gb|AC002329.2|AC002329', 76170, 'AC002329', '3e-18', 95.6, 48, 1, 60, 473 '1.0000' 474 ], 475 [ 476 'gb|AF132318.1|AF132318', 5383, 'AF132318', '0.04', 42.1, 21, 35, 55, 477 '0.3500' 478 ] 479); 480$count = 0; 481 482while ( my $hit = $result->next_hit ) { 483 my $d = shift @valid; 484 is( $hit->name, shift @$d ); 485 is( $hit->length, shift @$d ); 486 is( $hit->accession, shift @$d ); 487 float_is( $hit->significance, shift @$d ); 488 is( $hit->bits, shift @$d ); 489 is( $hit->raw_score, shift @$d ); 490 is( $hit->start, shift @$d ); 491 is( $hit->end, shift @$d ); 492 is( sprintf( "%.4f", $hit->frac_aligned_query ), shift @$d ); 493 494 if ( $count == 0 ) { 495 my $hsps_left = 1; 496 while ( my $hsp = $hit->next_hsp ) { 497 is( $hsp->query->start, 1 ); 498 is( $hsp->query->end, 60 ); 499 is( $hsp->query->strand, 1 ); 500 is( $hsp->hit->start, 154 ); 501 is( $hsp->hit->end, 212 ); 502 is( $hsp->hit->strand, 1 ); 503 is( $hsp->length('total'), 60 ); 504 float_is( $hsp->evalue, 3e-18 ); 505 is( $hsp->score, 48 ); 506 is( $hsp->bits, 95.6 ); 507 is( sprintf( "%.2f", $hsp->percent_identity ), 96.67 ); 508 is( sprintf( "%.4f", $hsp->frac_identical('query') ), 0.9667 ); 509 is( sprintf( "%.4f", $hsp->frac_identical('hit') ), 0.9831 ); 510 is( $hsp->query->frame(), 0 ); 511 is( $hsp->hit->frame(), 0 ); 512 is( $hsp->query->seq_id, undef ); 513 is( $hsp->hit->seq_id, 'gb|AY052359.1|' ); 514 is( $hsp->gaps('query'), 0 ); 515 is( $hsp->gaps('hit'), 1 ); 516 is( $hsp->gaps, 1 ); 517 is( $hsp->query_string, 518 'aggaatgctgtttaattggaatcgtacaatggagaatttgacggaaatagaatcaacgat' 519 ); 520 is( $hsp->hit_string, 521 'aggaatgctgtttaattggaatca-acaatggagaatttgacggaaatagaatcaacgat' 522 ); 523 is( $hsp->homology_string, 524 '||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||' 525 ); 526 my $aln = $hsp->get_aln; 527 is( sprintf( "%.2f", $aln->overall_percentage_identity ), 96.67 ); 528 is( sprintf( "%.2f", $aln->percentage_identity ), 98.31 ); 529 is( $hsp->n, 1 ); 530 $hsps_left--; 531 } 532 is( $hsps_left, 0 ); 533 } 534 last if ( $count++ > @valid ); 535} 536is( @valid, 0 ); 537 538#WU-BlastX test 539 540$searchio = Bio::SearchIO->new( 541 '-format' => 'blast', 542 '-file' => test_input_file('dnaEbsub_ecoli.wublastx') 543); 544 545$result = $searchio->next_result; 546is( 547 $result->algorithm_reference, 'Gish, W. (1996-2000) http://blast.wustl.edu 548Gish, Warren and David J. States (1993). Identification of protein coding 549regions by database similarity search. Nat. Genet. 3:266-72. 550' 551); 552is( $result->database_name, 'ecoli.aa' ); 553is( $result->database_letters, 1358990 ); 554is( $result->database_entries, 4289 ); 555is( $result->algorithm, 'BLASTX' ); 556like( $result->algorithm_version, qr/^2\.0MP\-WashU/ ); 557is( $result->query_name, 'gi|142864|gb|M10040.1|BACDNAE' ); 558is( $result->query_description, 559 'B.subtilis dnaE gene encoding DNA primase, complete cds' ); 560is( $result->query_accession, 'M10040.1' ); 561is( $result->query_gi, 142864 ); 562is( $result->query_length, 2001 ); 563is( $result->get_parameter('matrix'), 'blosum62' ); 564 565is( $result->get_statistic('lambda'), 0.318 ); 566is( $result->get_statistic('kappa'), 0.135 ); 567is( $result->get_statistic('entropy'), 0.401 ); 568 569is( $result->get_statistic('dbentries'), 4289 ); 570 571@valid = ( [ 'gi|1789447|gb|AAC76102.1|', 581, 'AAC76102', '1.1e-74', 671 ] ); 572$count = 0; 573 574while ( my $hit = $result->next_hit ) { 575 my $d = shift @valid; 576 is( $hit->name, shift @$d ); 577 is( $hit->length, shift @$d ); 578 is( $hit->accession, shift @$d ); 579 float_is( $hit->significance, shift @$d ); 580 is( $hit->raw_score, shift @$d ); 581 is( sprintf( "%.4f", $hit->frac_identical('query') ), '0.3640' ); 582 is( sprintf( "%.4f", $hit->frac_identical('hit') ), '0.3660' ); 583 is( sprintf( "%.4f", $hit->frac_conserved('query') ), '0.5370' ); 584 is( sprintf( "%.4f", $hit->frac_conserved('hit') ), '0.5400' ); 585 is( sprintf( "%.4f", $hit->frac_aligned_query ), '0.6200' ); 586 is( sprintf( "%.4f", $hit->frac_aligned_hit ), '0.7100' ); 587 588 if ( $count == 0 ) { 589 my $hsps_left = 1; 590 while ( my $hsp = $hit->next_hsp ) { 591 is( $hsp->query->start, 21 ); 592 is( $hsp->query->end, 1265 ); 593 is( $hsp->query->strand, 1 ); 594 is( $hsp->hit->start, 1 ); 595 is( $hsp->hit->end, 413 ); 596 is( $hsp->hit->strand, 0 ); 597 is( $hsp->length('total'), 421 ); 598 float_is( $hsp->evalue, 1.1e-74 ); 599 float_is( $hsp->pvalue, '1.1e-74' ); 600 is( $hsp->score, 671 ); 601 is( $hsp->bits, 265.8 ); 602 is( sprintf( "%.2f", $hsp->percent_identity ), 35.87 ); 603 604 is( sprintf( "%.4f", $hsp->frac_identical('query') ), 0.3639 ); 605 is( sprintf( "%.4f", $hsp->frac_identical('hit') ), 0.3656 ); 606 is( sprintf( "%.4f", $hsp->frac_conserved('query') ), 0.5373 ); 607 is( sprintf( "%.2f", $hsp->frac_conserved('hit') ), 0.54 ); 608 609 is( sprintf( "%.4f", $hsp->frac_identical('hsp') ), 0.3587 ); 610 is( sprintf( "%.4f", $hsp->frac_conserved('hsp') ), 0.5297 ); 611 612 is( $hsp->query->frame(), 2 ); 613 is( $hsp->hit->frame(), 0 ); 614 is( $hsp->gaps('query'), 6 ); 615 is( $hsp->gaps('hit'), 8 ); 616 is( $hsp->gaps, 14 ); 617 is( $hsp->query_string, 618'MGNRIPDEIVDQVQKSADIVEVIGDYVQLKKQGRNYFGLCPFHGESTPSFSVSPDKQIFHCFGCGAGGNVFSFLRQMEGYSFAESVSHLADKYQIDFPDDITVHSGARP---ESSGEQKMAEAHELLKKFYHHLLINTKEGQEALDYLLSRGFTKELINEFQIGYALDSWDFITKFLVKRGFSEAQMEKAGLLIRREDGSGYFDRFRNRVMFPIHDHHGAVVAFSGRALGSQQPKYMNSPETPLFHKSKLLYNFYKARLHIRKQERAVLFEGFADVYTAVSSDVKESIATMGTSLTDDHVKILRRNVEEIILCYDSDKAGYEATLKASELL---QKKGCKVRVAMIPDGLDPDDYIKKFGGEKFKNDIIDASVTVMAFKMQYFRKGKNLSDEGDRLAYIKDVLKEISTLSGSLEQEVYVKQ' 619 ); 620 is( $hsp->hit_string, 621'MAGRIPRVFINDLLARTDIVDLIDARVKLKKQGKNFHACCPFHNEKTPSFTVNGEKQFYHCFGCGAHGNAIDFLMNYDKLEFVETVEELAAMHNLEVPFE----AGSGPSQIERHQRQTLYQLMDGLNTFYQQSL-QQPVATSARQYLEKRGLSHEVIARFAIGFAPPGWDNVLKRFGGNPENRQSLIDAGMLVTNDQGRSY-DRFRERVMFPIRDKRGRVIGFGGRVLGNDTPKYLNSPETDIFHKGRQLYGLYEAQQDNAEPNRLLVVEGYMDVVALAQYGINYAVASLGTSTTADHIQLLFRATNNVICCYDGDRAGRDAAWRALETALPYMTDGRQLRFMFLPDGEDPDTLVRKEGKEAFEARM-EQAMPLSAFLFNSLMPQVDLSTPDGRARLSTLALPLISQVPGETLR-IYLRQ' 622 ); 623 is( $hsp->homology_string, 624'M RIP ++ + DIV++I V+LKKQG+N+ CPFH E TPSF+V+ +KQ +HCFGCGA GN FL + F E+V LA + ++ P + +G+ P E Q + + + L FY L A YL RG + E+I F IG+A WD + K + + AG+L+ + G Y DRFR RVMFPI D G V+ F GR LG+ PKY+NSPET +FHK + LY Y+A+ + R ++ EG+ DV + ++A++GTS T DH+++L R +I CYD D+AG +A +A E G ++R +PDG DPD ++K G E F+ + + ++ + AF +LS R L IS + G + +Y++Q' 625 ); 626 is( 627 join( ' ', $hsp->seq_inds( 'query', 'nomatch', 1 ) ), 628'24-29 39-47 54-56 60-71 90-98 129-137 150-152 156-158 180-182 192-194 219-221 228-236 243-251 255-263 267-269 279-284 291-296 300-302 309-311 315-317 321-332 342-344 351-362 366-368 372-374 378-383 387-389 393-398 405-413 417-440 444-449 456-461 468-470 474-476 486-491 495-497 510-518 525-527 531-533 537-557 561-569 573-578 594-599 603-605 609-614 618-620 633-635 654-656 660-665 669-671 678-680 684-686 693-695 705-710 738-740 753-755 759-761 768-773 786-797 801-806 810-812 819-821 831-833 840-860 864-869 894-896 900-902 921-923 927-938 945-947 957-959 972-974 981-986 993-995 999-1013 1017-1019 1029-1037 1050-1052 1062-1067 1077-1079 1083-1085 1089-1091 1098-1103 1107-1109 1113-1115 1122-1124 1128-1130 1137-1163 1173-1184 1188-1208 1212-1217 1224-1226 1230-1232 1236-1244 1248-1250' 629 ); 630 is( 631 join( ' ', $hsp->seq_inds( 'query', 'mismatch', 1 ) ), 632'24-29 39-47 54-56 60-71 90-98 129-137 150-152 156-158 180-182 192-194 219-221 228-236 243-251 255-263 267-269 279-284 291-296 300-302 309-311 315-317 342-344 351-362 366-368 372-374 378-383 387-389 393-398 405-413 420-440 444-449 456-461 468-470 474-476 486-491 495-497 510-518 525-527 531-533 537-557 561-569 573-578 594-599 603-605 609-614 633-635 654-656 660-665 669-671 678-680 684-686 693-695 705-710 738-740 753-755 759-761 768-773 786-797 801-806 810-812 819-821 831-833 840-860 864-869 894-896 900-902 921-923 927-938 945-947 957-959 972-974 981-986 993-995 999-1013 1017-1019 1029-1037 1050-1052 1062-1067 1077-1079 1083-1085 1089-1091 1098-1103 1113-1115 1122-1124 1128-1130 1137-1163 1173-1184 1188-1208 1212-1217 1224-1226 1230-1232 1236-1244' 633 ); 634 is( 635 join( ' ', $hsp->seq_inds( 'hit', 'nomatch', 1 ) ), 636'2 3 7-9 12 14-17 24-26 37-39 44 46 54 58 67 70-72 75-77 79-81 83 87 88 91 92 94 97 99 104 106-108 110-113 115 117 119 120 122 124 125 128-130 132-138 140 141 144 145 148 150 154 155 157 162-164 167 169 171-177 179-181 183 184 190 191 193 195 196 202 209 211 212 214 217 219 222 226 227 237 242 244 247 248 253-256 258 259 261 264 268 271-277 279 280 289 291 298 300-303 306 310 315 318 319 322 324-331 333 337-339 344 348 349 353 355 357 360 361 364 367 369 372-380 384-387 389-395 397 398 401 403 405-407' 637 ); 638 is( 639 join( ' ', $hsp->seq_inds( 'hit', 'mismatch', 1 ) ), 640'2 3 7-9 12 14-17 24-26 37-39 44 46 54 58 67 70-72 75-77 79-81 83 87 88 91 92 94 97 99 104 110-113 115 117 119 120 122 124 125 128-130 132-138 140 141 144 145 148 150 154 155 157 162-164 167 169 171-177 179-181 183 184 190 191 193 195 196 202 209 211 212 214 217 219 222 226 227 237 242 244 247 248 253-256 258 259 261 264 268 271-277 279 280 289 291 298 300-303 306 310 315 318 319 322 324 325 329-331 333 337-339 344 348 349 353 355 357 360 361 364 367 369 372-380 384-387 389-395 397 398 401 403 405-407' 641 ); 642 is( join( ' ', $hsp->seq_inds( 'query', 'gaps', 1 ) ), '347 1004' ); 643 is( join( ' ', $hsp->seq_inds( 'hit', 'gaps', 1 ) ), 644 '100 131 197 362 408' ); 645 is( $hsp->ambiguous_seq_inds, 'query' ); 646 is( $hsp->n, 1 ); 647 $hsps_left--; 648 } 649 is( $hsps_left, 0 ); 650 } 651 last if ( $count++ > @valid ); 652} 653is( @valid, 0 ); 654 655#Trickier WU-Blast 656$searchio = Bio::SearchIO->new( 657 '-format' => 'blast', 658 '-file' => test_input_file('tricky.wublast') 659); 660$result = $searchio->next_result; 661my $hits_left = 1; 662while ( my $hit = $result->next_hit ) { 663 664# frac_aligned_hit used to be over 1, frac_identical & frac_conserved are still too wrong 665 TODO: { 666 local $TODO = 'frac_identical & frac_conserved are still too wrong'; 667 cmp_ok sprintf( "%.3f", $hit->frac_identical ), '>', 0.9; 668 cmp_ok sprintf( "%.3f", $hit->frac_conserved ), '<=', 1; 669 } 670 is( sprintf( "%.2f", $hit->frac_aligned_query ), '0.92' ); 671 is( sprintf( "%.2f", $hit->frac_aligned_hit ), '0.91' ); 672 $hits_left--; 673} 674is( $hits_left, 0 ); 675 676# More frac_ method testing, this time on ncbi blastn 677$searchio = Bio::SearchIO->new( 678 '-format' => 'blast', 679 '-file' => test_input_file('frac_problems.blast') 680); 681my @expected = ( "1.000", "0.943" ); 682while ( my $result = $searchio->next_result ) { 683 my $hit = $result->next_hit; 684 is( $hit->frac_identical, shift @expected ); 685} 686is( @expected, 0 ); 687 688# And even more: frac_aligned_query should never be over 1! 689$searchio = Bio::SearchIO->new( 690 '-format' => 'blast', 691 '-file' => test_input_file('frac_problems2.blast') 692); 693$result = $searchio->next_result; 694$hit = $result->next_hit; 695is $hit->frac_aligned_query, 0.97; 696 697# Also, start and end should be sane 698$searchio = Bio::SearchIO->new( 699 '-format' => 'blast', 700 '-file' => test_input_file('frac_problems3.blast') 701); 702$result = $searchio->next_result; 703$hit = $result->next_hit; 704is $hit->start('sbjct'), 207; 705is $hit->end('sbjct'), 1051; 706 707#WU-TBlastN test 708 709$searchio = Bio::SearchIO->new( 710 '-format' => 'blast', 711 '-file' => test_input_file('dnaEbsub_ecoli.wutblastn') 712); 713 714$result = $searchio->next_result; 715is( 716 $result->algorithm_reference, 'Gish, W. (1996-2000) http://blast.wustl.edu 717' 718); 719is( $result->database_name, 'ecoli.nt' ); 720is( $result->database_letters, 4662239 ); 721is( $result->database_entries, 400 ); 722is( $result->algorithm, 'TBLASTN' ); 723like( $result->algorithm_version, qr/^2\.0MP\-WashU/ ); 724is( $result->query_name, 'gi|142865|gb|AAA22406.1|' ); 725is( $result->query_description, 'DNA primase' ); 726is( $result->query_accession, 'AAA22406.1' ); 727is( $result->query_gi, 142865 ); 728is( $result->query_length, 603 ); 729is( $result->get_parameter('matrix'), 'blosum62' ); 730 731is( $result->get_statistic('lambda'), '0.320' ); 732is( $result->get_statistic('kappa'), 0.136 ); 733is( $result->get_statistic('entropy'), 0.387 ); 734 735is( $result->get_statistic('dbentries'), 400 ); 736 737@valid = 738 ( [ 'gi|1789441|gb|AE000388.1|AE000388', 10334, 'AE000388', '1.4e-73', 671 ] 739 ); 740$count = 0; 741 742while ( my $hit = $result->next_hit ) { 743 my $d = shift @valid; 744 is( $hit->name, shift @$d ); 745 is( $hit->length, shift @$d ); 746 is( $hit->accession, shift @$d ); 747 float_is( $hit->significance, shift @$d ); 748 is( $hit->raw_score, shift @$d ); 749 750 if ( $count == 0 ) { 751 my $hsps_left = 1; 752 while ( my $hsp = $hit->next_hsp ) { 753 is( $hsp->query->start, 1 ); 754 is( $hsp->query->end, 415 ); 755 is( $hsp->query->strand, 0 ); 756 is( $hsp->hit->start, 4778 ); 757 is( $hsp->hit->end, 6016 ); 758 is( $hsp->hit->strand, 1 ); 759 is( $hsp->length('total'), 421 ); 760 float_is( $hsp->evalue, 1.4e-73 ); 761 float_is( $hsp->pvalue, 1.4e-73 ); 762 is( $hsp->score, 671 ); 763 is( $hsp->bits, 265.8 ); 764 is( sprintf( "%.2f", $hsp->percent_identity ), 35.87 ); 765 is( sprintf( "%.4f", $hsp->frac_identical('hit') ), 0.3656 ); 766 is( sprintf( "%.4f", $hsp->frac_identical('query') ), 0.3639 ); 767 is( sprintf( "%.4f", $hsp->frac_conserved('hsp') ), 0.5297 ); 768 is( $hsp->query->frame(), 0 ); 769 is( $hsp->hit->frame(), 1 ); 770 is( $hsp->gaps('query'), 6 ); 771 is( $hsp->gaps('hit'), 8 ); 772 is( $hsp->gaps, 14 ); 773 is( $hsp->query_string, 774'MGNRIPDEIVDQVQKSADIVEVIGDYVQLKKQGRNYFGLCPFHGESTPSFSVSPDKQIFHCFGCGAGGNVFSFLRQMEGYSFAESVSHLADKYQIDFPDDITVHSGARP---ESSGEQKMAEAHELLKKFYHHLLINTKEGQEALDYLLSRGFTKELINEFQIGYALDSWDFITKFLVKRGFSEAQMEKAGLLIRREDGSGYFDRFRNRVMFPIHDHHGAVVAFSGRALGSQQPKYMNSPETPLFHKSKLLYNFYKARLHIRKQERAVLFEGFADVYTAVSSDVKESIATMGTSLTDDHVKILRRNVEEIILCYDSDKAGYEATLKASELL---QKKGCKVRVAMIPDGLDPDDYIKKFGGEKFKNDIIDASVTVMAFKMQYFRKGKNLSDEGDRLAYIKDVLKEISTLSGSLEQEVYVKQ' 775 ); 776 is( $hsp->hit_string, 777'MAGRIPRVFINDLLARTDIVDLIDARVKLKKQGKNFHACCPFHNEKTPSFTVNGEKQFYHCFGCGAHGNAIDFLMNYDKLEFVETVEELAAMHNLEVPFE----AGSGPSQIERHQRQTLYQLMDGLNTFYQQSL-QQPVATSARQYLEKRGLSHEVIARFAIGFAPPGWDNVLKRFGGNPENRQSLIDAGMLVTNDQGRSY-DRFRERVMFPIRDKRGRVIGFGGRVLGNDTPKYLNSPETDIFHKGRQLYGLYEAQQDNAEPNRLLVVEGYMDVVALAQYGINYAVASLGTSTTADHIQLLFRATNNVICCYDGDRAGRDAAWRALETALPYMTDGRQLRFMFLPDGEDPDTLVRKEGKEAFEARM-EQAMPLSAFLFNSLMPQVDLSTPDGRARLSTLALPLISQVPGETLR-IYLRQ' 778 ); 779 is( $hsp->homology_string, 780'M RIP ++ + DIV++I V+LKKQG+N+ CPFH E TPSF+V+ +KQ +HCFGCGA GN FL + F E+V LA + ++ P + +G+ P E Q + + + L FY L A YL RG + E+I F IG+A WD + K + + AG+L+ + G Y DRFR RVMFPI D G V+ F GR LG+ PKY+NSPET +FHK + LY Y+A+ + R ++ EG+ DV + ++A++GTS T DH+++L R +I CYD D+AG +A +A E G ++R +PDG DPD ++K G E F+ + + ++ + AF +LS R L IS + G + +Y++Q' 781 ); 782 is( 783 join( ' ', $hsp->seq_inds( 'query', 'nomatch', 1 ) ), 784'2 3 7-9 12 14-17 24-26 37-39 44 46 54 58 67 70-72 75-77 79-81 83 87 88 91 92 94 97 99 101-104 108 111-114 116 118 120 121 123 125 126 129-131 133-140 142 143 146 147 150 152 156 157 159 164-166 169 171 173-179 181-183 185 186 192 193 195 197 198 200 205 212 214 215 217 220 222 225 229 230 240 245 247 250 251 256-259 261 262 264 267 271 274-280 282 283 292 294 301 303-306 309 313 318 321 322 325 327-331 333 337-339 344 348 349 353 355 357 360 361 363 365 368 370 373-381 385-388 390-396 398 399 402 404 406-408 410' 785 ); 786 is( 787 join( ' ', $hsp->seq_inds( 'hit', 'nomatch', 1 ) ), 788'4781-4786 4796-4804 4811-4813 4817-4828 4847-4855 4886-4894 4907-4909 4913-4915 4937-4939 4949-4951 4976-4978 4985-4993 5000-5008 5012-5020 5024-5026 5036-5041 5048-5053 5057-5059 5066-5068 5072-5074 5087-5089 5093-5101 5105-5116 5120-5122 5126-5128 5132-5137 5141-5143 5147-5152 5159-5167 5171-5191 5195-5200 5207-5212 5219-5221 5225-5227 5237-5242 5246-5248 5261-5269 5276-5278 5282-5284 5288-5308 5312-5320 5324-5329 5345-5350 5354-5356 5360-5365 5381-5383 5402-5404 5408-5413 5417-5419 5426-5428 5432-5434 5441-5443 5453-5458 5486-5488 5501-5503 5507-5509 5516-5521 5534-5545 5549-5554 5558-5560 5567-5569 5579-5581 5588-5608 5612-5617 5642-5644 5648-5650 5669-5671 5675-5686 5693-5695 5705-5707 5720-5722 5729-5734 5741-5743 5747-5770 5774-5776 5786-5794 5807-5809 5819-5824 5834-5836 5840-5842 5846-5848 5855-5860 5867-5869 5876-5878 5882-5884 5891-5917 5927-5938 5942-5962 5966-5971 5978-5980 5984-5986 5990-5998' 789 ); 790 is( join( ' ', $hsp->seq_inds( 'query', 'gaps', 1 ) ), '109 328' ); 791 is( join( ' ', $hsp->seq_inds( 'hit', 'gaps', 1 ) ), 792 '5077 5170 5368 5863 6001' ); 793 is( $hsp->ambiguous_seq_inds, 'subject' ); 794 is( $hsp->n, 1 ); 795 $hsps_left--; 796 } 797 is( $hsps_left, 0 ); 798 } 799 last if ( $count++ > @valid ); 800} 801is( $count, 1 ); 802 803# WU-BLAST TBLASTX 804$searchio = Bio::SearchIO->new( 805 '-format' => 'blast', 806 '-file' => test_input_file('dnaEbsub_ecoli.wutblastx') 807); 808 809$result = $searchio->next_result; 810is( 811 $result->algorithm_reference, 'Gish, W. (1996-2000) http://blast.wustl.edu 812' 813); 814is( $result->database_name, 'ecoli.nt' ); 815is( $result->database_letters, 4662239 ); 816is( $result->database_entries, 400 ); 817is( $result->algorithm, 'TBLASTX' ); 818like( $result->algorithm_version, qr/^2\.0MP\-WashU/ ); 819is( $result->query_name, 'gi|142864|gb|M10040.1|BACDNAE' ); 820is( $result->query_description, 821 'B.subtilis dnaE gene encoding DNA primase, complete cds' ); 822is( $result->query_accession, 'M10040.1' ); 823is( $result->query_gi, 142864 ); 824is( $result->query_length, 2001 ); 825is( $result->get_parameter('matrix'), 'blosum62' ); 826 827is( $result->get_statistic('lambda'), 0.318 ); 828is( $result->get_statistic('kappa'), 0.135 ); 829is( $result->get_statistic('entropy'), 0.401 ); 830is( $result->get_statistic('dbentries'), 400 ); 831 832@valid = ( 833 [ 834 'gi|1789441|gb|AE000388.1|AE000388', 835 10334, 'AE000388', '6.4e-70', 318, 148.6 836 ], 837 [ 'gi|2367383|gb|AE000509.1|AE000509', 10589, 'AE000509', 1, 59, 29.9 ] 838); 839$count = 0; 840 841while ( my $hit = $result->next_hit ) { 842 my $d = shift @valid; 843 is( $hit->name, shift @$d ); 844 is( $hit->length, shift @$d ); 845 is( $hit->accession, shift @$d ); 846 847 # using e here to deal with 0.9992 coming out right here as well 848 float_is( $hit->significance, shift @$d ); 849 is( $hit->raw_score, shift @$d ); 850 is( $hit->bits, shift @$d ); 851 if ( $count == 0 ) { 852 my $hspcounter = 0; 853 while ( my $hsp = $hit->next_hsp ) { 854 $hspcounter++; 855 if ( $hspcounter == 3 ) { 856 857 # let's actually look at the 3rd HSP 858 is( $hsp->query->start, 441 ); 859 is( $hsp->query->end, 617 ); 860 is( $hsp->query->strand, 1 ); 861 is( $hsp->hit->start, 5192 ); 862 is( $hsp->hit->end, 5368 ); 863 is( $hsp->hit->strand, 1 ); 864 is( $hsp->length('total'), 59 ); 865 float_is( $hsp->evalue, 6.4e-70 ); 866 float_is( $hsp->pvalue, 6.4e-70 ); 867 is( $hsp->score, 85 ); 868 is( $hsp->bits, 41.8 ); 869 is( sprintf( "%.2f", $hsp->percent_identity ), '32.20' ); 870 is( sprintf( "%.3f", $hsp->frac_identical('hit') ), 0.322 ); 871 is( sprintf( "%.3f", $hsp->frac_identical('query') ), 0.322 ); 872 is( sprintf( "%.4f", $hsp->frac_conserved('hsp') ), 0.4746 ); 873 is( $hsp->query->frame(), 2 ); 874 is( $hsp->hit->frame(), 1 ); 875 is( $hsp->gaps('query'), 0 ); 876 is( $hsp->gaps('hit'), 0 ); 877 is( $hsp->gaps, 0 ); 878 is( $hsp->n, 1 ); 879 is( $hsp->query_string, 880'ALDYLLSRGFTKELINEFQIGYALDSWDFITKFLVKRGFSEAQMEKAGLLIRREDGSGY' 881 ); 882 is( $hsp->hit_string, 883'ARQYLEKRGLSHEVIARFAIGFAPPGWDNVLKRFGGNPENRQSLIDAGMLVTNDQGRSY' 884 ); 885 is( $hsp->homology_string, 886'A YL RG + E+I F IG+A WD + K + + AG+L+ + G Y' 887 ); 888 is( 889 join( ' ', $hsp->seq_inds( 'query', 'nomatch', 1 ) ), 890'444-449 456-461 468-470 474-476 486-491 495-497 510-518 525-527 531-533 537-557 561-569 573-578 594-599 603-605 609-614' 891 ); 892 is( 893 join( ' ', $hsp->seq_inds( 'hit', 'nomatch', 1 ) ), 894'5195-5200 5207-5212 5219-5221 5225-5227 5237-5242 5246-5248 5261-5269 5276-5278 5282-5284 5288-5308 5312-5320 5324-5329 5345-5350 5354-5356 5360-5365' 895 ); 896 is( $hsp->ambiguous_seq_inds, 'query/subject' ); 897 last; 898 } 899 } 900 is( $hspcounter, 3 ); 901 } 902 elsif ( $count == 1 ) { 903 my $hsps_to_do = 1; 904 while ( my $hsp = $hit->next_hsp ) { 905 is( $hsp->query->start, 587 ); 906 is( $hsp->query->end, 706 ); 907 is( $hsp->query->strand, -1 ); 908 is( $hsp->hit->start, 4108 ); 909 is( $hsp->hit->end, 4227 ); 910 is( $hsp->hit->strand, -1 ); 911 is( $hsp->length('total'), 40 ); 912 float_is( $hsp->evalue, 7.1 ); 913 float_is( $hsp->pvalue, '1.00' ); 914 is( $hsp->score, 59 ); 915 is( $hsp->bits, 29.9 ); 916 is( sprintf( "%.2f", $hsp->percent_identity ), '37.50' ); 917 is( sprintf( "%.4f", $hsp->frac_identical('hit') ), '0.3750' ); 918 is( sprintf( "%.4f", $hsp->frac_identical('query') ), '0.3750' ); 919 is( sprintf( "%.4f", $hsp->frac_conserved('hsp') ), '0.4750' ); 920 is( $hsp->query->frame(), 2 ); 921 is( $hsp->hit->frame(), 2 ); 922 is( $hsp->gaps('query'), 0 ); 923 is( $hsp->gaps('hit'), 0 ); 924 is( $hsp->gaps, 0 ); 925 is( $hsp->n, 1 ); 926 is( $hsp->query_string, 927 'WLPRALPEKATTAP**SWIGNMTRFLKRSKYPLPSSRLIR' ); 928 is( $hsp->hit_string, 'WLSRTTVGSSTVSPRTFWITRMKVKLSSSKVTLPSTKSTR' ); 929 is( $hsp->homology_string, 930 'WL R +T +P WI M L SK LPS++ R' ); 931 $hsps_to_do--; 932 last; 933 } 934 is( $hsps_to_do, 0 ); 935 } 936 last if ( $count++ > @valid ); 937} 938is( $count, 2 ); 939 940# WU-BLAST -echofilter option test (Bug 2388) 941$searchio = Bio::SearchIO->new( 942 '-format' => 'blast', 943 '-file' => test_input_file('echofilter.wublastn') 944); 945 946$result = $searchio->next_result; 947is( 948 $result->algorithm_reference, 'Gish, W. (1996-2006) http://blast.wustl.edu 949' 950); 951is( $result->database_name, 'NM_003201.fa' ); 952is( $result->database_letters, 1936 ); 953is( $result->database_entries, 1 ); 954is( $result->algorithm, 'BLASTN' ); 955like( $result->algorithm_version, qr/^2\.0MP\-WashU/ ); 956like( $result->query_name, 957qr/ref|NM_003201.1| Homo sapiens transcription factor A, mitochondrial \(TFAM\), mRNA/ 958); 959is( $result->query_accession, 'NM_003201.1' ); 960 961is( $result->query_length, 1936 ); 962is( $result->get_statistic('lambda'), 0.192 ); 963is( $result->get_statistic('kappa'), 0.182 ); 964is( $result->get_statistic('entropy'), 0.357 ); 965is( $result->get_statistic('dbletters'), 1936 ); 966is( $result->get_statistic('dbentries'), 1 ); 967is( $result->get_parameter('matrix'), '+5,-4' ); 968 969@valid = ( [ 'ref|NM_003201.1|', 1936, 'NM_003201', '0', 9680 ], ); 970$count = 0; 971while ( $hit = $result->next_hit ) { 972 my $d = shift @valid; 973 974 is( $hit->name, shift @$d ); 975 is( $hit->length, shift @$d ); 976 is( $hit->accession, shift @$d ); 977 float_is( $hit->significance, shift @$d ); 978 is( $hit->raw_score, shift @$d ); 979 980 if ( $count == 0 ) { 981 my $hsps_left = 1; 982 while ( my $hsp = $hit->next_hsp ) { 983 is( $hsp->query->start, 1 ); 984 is( $hsp->query->end, 1936 ); 985 is( $hsp->hit->start, 1 ); 986 is( $hsp->hit->end, 1936 ); 987 is( $hsp->length('total'), 1936 ); 988 989 float_is( $hsp->evalue, 0. ); 990 float_is( $hsp->pvalue, '0.' ); 991 is( $hsp->score, 9680 ); 992 is( $hsp->bits, 1458.4 ); 993 is( $hsp->percent_identity, 100 ); 994 is( $hsp->frac_identical('query'), 1.00 ); 995 is( $hsp->frac_identical('hit'), 1.00 ); 996 is( $hsp->gaps, 0 ); 997 is( $hsp->n, 1 ); 998 $hsps_left--; 999 } 1000 is( $hsps_left, 0 ); 1001 } 1002 last if ( $count++ > @valid ); 1003} 1004is( @valid, 0 ); 1005 1006# Do a multiblast report test 1007$searchio = Bio::SearchIO->new( 1008 '-format' => 'blast', 1009 '-file' => test_input_file('multi_blast.bls') 1010); 1011 1012@expected = qw(CATH_RAT CATL_HUMAN CATL_RAT PAPA_CARPA); 1013my $results_left = 4; 1014while ( my $result = $searchio->next_result ) { 1015 like($result->algorithm_reference, qr/Gapped BLAST and PSI-BLAST/); 1016 is( $result->query_name, shift @expected, "Multiblast query test" ); 1017 $results_left--; 1018} 1019is( $results_left, 0 ); 1020 1021# Test GCGBlast parsing 1022 1023$searchio = Bio::SearchIO->new( 1024 '-format' => 'blast', 1025 '-file' => test_input_file('test.gcgblast') 1026); 1027$result = $searchio->next_result(); 1028like($result->algorithm_reference,qr/Gapped BLAST and PSI-BLAST/); 1029is( $result->query_name, '/v0/people/staji002/test.gcg' ); 1030is( $result->algorithm, 'BLASTP' ); 1031is( $result->algorithm_version, '2.2.1 [Apr-13-2001]' ); 1032is( $result->database_name, 'pir' ); 1033is( $result->database_entries, 274514 ); 1034is( $result->database_letters, 93460074 ); 1035is( $result->get_statistic('querylength'), 44 ); 1036is( $result->get_statistic('effectivedblength'), 65459646 ); 1037is( $result->get_statistic('effectivespace'), 2880224424 ); 1038is( $result->get_statistic('effectivespaceused'), 2880224424 ); 1039 1040$hit = $result->next_hit; 1041is( $hit->description, 'F22B7.10 protein - Caenorhabditis elegans' ); 1042is( $hit->name, 'PIR2:S44629' ); 1043is( $hit->length, 628 ); 1044is( $hit->accession, 'PIR2:S44629' ); 1045float_is( $hit->significance, 2e-08 ); 1046is( $hit->raw_score, 136 ); 1047is( $hit->bits, '57.0' ); 1048$hsp = $hit->next_hsp; 1049float_is( $hsp->evalue, 2e-08 ); 1050is( $hsp->bits, '57.0' ); 1051is( $hsp->score, 136 ); 1052is( int( $hsp->percent_identity ), 28 ); 1053is( sprintf( "%.2f", $hsp->frac_identical('query') ), 0.29 ); 1054is( $hsp->frac_conserved('total'), 69 / 135 ); 1055is( $hsp->gaps('total'), 8 ); 1056is( $hsp->gaps('hit'), 6 ); 1057is( $hsp->gaps('query'), 2 ); 1058 1059is( $hsp->hit->start, 342 ); 1060is( $hsp->hit->end, 470 ); 1061is( $hsp->query->start, 3 ); 1062is( $hsp->query->end, 135 ); 1063 1064is( $hsp->query_string, 1065'CAAEFDFMEKETPLRYTKTXXXXXXXXXXXXXXRKIISDMWGVLAKQQTHVRKHQFDHGELVYHALQLLAYTALGILIMRLKLFLTPYMCVMASLICSRQLFGW--LFCKVHPGAIVFVILAAMSIQGSANLQTQ' 1066); 1067is( $hsp->hit_string, 1068'CSAEFDFIQYSTIEKLCGTLLIPLALISLVTFVFNFVKNT-NLLWRNSEEIG----ENGEILYNVVQLCCSTVMAFLIMRLKLFMTPHLCIVAALFANSKLLGGDRISKTIRVSALVGVI-AILFYRGIPNIRQQ' 1069); 1070is( $hsp->homology_string, 1071'C+AEFDF++ T + T + + +L + + ++GE++Y+ +QL T + LIMRLKLF+TP++C++A+L + +L G + + A+V VI A + +G N++ Q' 1072); 1073 1074#test all the database accession number formats 1075$searchio = Bio::SearchIO->new( 1076 -format => 'blast', 1077 -file => test_input_file('testdbaccnums.out') 1078); 1079$result = $searchio->next_result; 1080like($result->algorithm_reference,qr/Gapped BLAST and PSI-BLAST/); 1081is( $result->rid, '1036160600-011802-21377' ); 1082is( $result->get_statistic('querylength'), 9 ); 1083is( $result->get_statistic('effectivedblength'), 35444647 ); 1084is( $result->get_statistic('effectivespace'), 319001823 ); 1085is( $result->get_statistic('effectivespaceused'), 319001823 ); 1086 1087@valid = ( 1088 [ 'pir||T14789', 'T14789', 'T14789', 'CAB53709', 'AAH01726' ], 1089 [ 'gb|NP_065733.1|CYT19', 'NP_065733', 'CYT19' ], 1090 [ 'emb|XP_053690.4|Cyt19', 'XP_053690' ], 1091 [ 'dbj|NP_056277.2|DKFZP586L0724', 'NP_056277' ], 1092 [ 'prf||XP_064862.2', 'XP_064862' ], 1093 [ 'pdb|BAB13968.1|1', 'BAB13968' ], 1094 [ 'sp|Q16478|GLK5_HUMAN', 'Q16478' ], 1095 [ 'pat|US|NP_002079.2', 'NP_002079' ], 1096 [ 'bbs|NP_079463.2|', 'NP_079463' ], 1097 [ 'gnl|db1|NP_002444.1', 'NP_002444' ], 1098 [ 'ref|XP_051877.1|', 'XP_051877' ], 1099 [ 'lcl|AAH16829.1|', 'AAH16829' ], 1100 [ 'gi|1|gb|NP_065733.1|CYT19', 'NP_065733' ], 1101 [ 'gi|2|emb|XP_053690.4|Cyt19', 'XP_053690' ], 1102 [ 'gi|3|dbj|NP_056277.2|DKFZP586L0724', 'NP_056277' ], 1103 [ 'gi|4|pir||T14789', 'T14789' ], 1104 [ 'gi|5|prf||XP_064862.2', 'XP_064862' ], 1105 [ 'gi|6|pdb|BAB13968.1|1', 'BAB13968' ], 1106 [ 'gi|7|sp|Q16478|GLK5_HUMAN', 'Q16478' ], 1107 [ 'gi|8|pat|US|NP_002079.2', 'NP_002079' ], 1108 [ 'gi|9|bbs|NP_079463.2|', 'NP_079463' ], 1109 [ 'gi|10|gnl|db1|NP_002444.1', 'NP_002444' ], 1110 [ 'gi|11|ref|XP_051877.1|', 'XP_051877' ], 1111 [ 'gi|12|lcl|AAH16829.1|', 'AAH16829' ], 1112 [ 'MY_test_ID', 'MY_test_ID' ] 1113); 1114 1115$hit = $result->next_hit; 1116my $d = shift @valid; 1117is( $hit->name, shift @$d ); 1118is( $hit->accession, shift @$d ); 1119my @accnums = $hit->each_accession_number; 1120foreach my $a (@accnums) { 1121 is( $a, shift @$d ); 1122} 1123$d = shift @valid; 1124$hit = $result->next_hit; 1125is( $hit->name, shift @$d ); 1126is( $hit->accession, shift @$d ); 1127is( $hit->locus, shift @$d ); 1128 1129$hits_left = 23; 1130while ( $hit = $result->next_hit ) { 1131 my $d = shift @valid; 1132 is( $hit->name, shift @$d ); 1133 is( $hit->accession, shift @$d ); 1134 $hits_left--; 1135} 1136is( $hits_left, 0 ); 1137 1138# Parse MEGABLAST 1139 1140# parse the BLAST-like output 1141my $infile = test_input_file('503384.MEGABLAST.2'); 1142my $in = Bio::SearchIO->new( 1143 -file => $infile, 1144 -format => 'blast' 1145); # this is megablast blast-like output 1146my $r = $in->next_result; 1147my @dcompare = ( 1148 [ 1149 'Contig3700', 5631, 396, 785, '0.0', 785, '0.0', 396, 639, 12, 8723, 1150 9434, 1, 4083, 4794, -1 1151 ], 1152 [ 1153 'Contig3997', 12734, 335, 664, '0.0', 664, '0.0', 335, 401, 0, 1282, 1154 1704, 1, 1546, 1968, -1 1155 ], 1156 [ 1157 'Contig634', 858, 245, 486, '1e-136', 486, 1158 '1e-136', 245, 304, 3, 7620, 7941, 1159 1, 1, 321, -1 1160 ], 1161 [ 1162 'Contig1853', 2314, 171, 339, '1e-91', 339, 1163 '1e-91', 171, 204, 0, 6406, 6620, 1164 1, 1691, 1905, 1 1165 ] 1166); 1167 1168like($r->algorithm_reference,qr/A greedy algorithm for aligning DNA sequences/); 1169is( $r->algorithm, 'MEGABLAST' ); 1170is( $r->query_name, '503384' ); 1171is( $r->query_description, '11337 bp 2 contigs' ); 1172is( $r->query_length, 11337 ); 1173is( $r->database_name, 'cneoA.nt' ); 1174is( $r->database_letters, 17206226 ); 1175is( $r->database_entries, 4935 ); 1176is( $r->get_statistic('querylength'), 11318 ); 1177is( $r->get_statistic('effectivedblength'), 17112461 ); 1178is( $r->get_statistic('effectivespace'), 193678833598 ); 1179is( $r->get_statistic('effectivespaceused'), 0 ); 1180 1181$hits_left = 4; 1182while ( my $hit = $r->next_hit ) { 1183 my $d = shift @dcompare; 1184 is( $hit->name, shift @$d ); 1185 is( $hit->length, shift @$d ); 1186 is( $hit->raw_score, shift @$d ); 1187 is( $hit->bits, shift @$d ); 1188 float_is( $hit->significance, shift @$d ); 1189 1190 my $hsp = $hit->next_hsp; 1191 is( $hsp->bits, shift @$d ); 1192 float_is( $hsp->evalue, shift @$d ); 1193 is( $hsp->score, shift @$d ); 1194 is( $hsp->num_identical, shift @$d ); 1195 is( $hsp->gaps('total'), shift @$d ); 1196 is( $hsp->query->start, shift @$d ); 1197 is( $hsp->query->end, shift @$d ); 1198 is( $hsp->query->strand, shift @$d ); 1199 is( $hsp->hit->start, shift @$d ); 1200 is( $hsp->hit->end, shift @$d ); 1201 is( $hsp->hit->strand, shift @$d ); 1202 is( $hsp->n, 1 ); 1203 $hits_left--; 1204} 1205is( $hits_left, 0 ); 1206 1207# Let's test RPS-BLAST 1208 1209my $parser = Bio::SearchIO->new( 1210 -format => 'blast', 1211 -file => test_input_file('ecoli_domains.rpsblast') 1212); 1213 1214$r = $parser->next_result; 1215is( $r->algorithm, 'RPS-BLAST(BLASTP)'); 1216is( $r->algorithm_version, '2.2.4 [Aug-26-2002]'); 1217is( $r->algorithm_reference, undef ); 1218is( $r->query_name, 'gi|1786183|gb|AAC73113.1|' ); 1219is( $r->query_gi, 1786183 ); 1220is( $r->num_hits, 7 ); 1221is( $r->get_statistic('querylength'), 438 ); 1222is( $r->get_statistic('effectivedblength'), 31988 ); 1223is( $r->get_statistic('effectivespace'), 14010744 ); 1224is( $r->get_statistic('effectivespaceused'), 24054976 ); 1225$hit = $r->next_hit; 1226is( $hit->name, 'gnl|CDD|3919' ); 1227float_is( $hit->significance, 0.064 ); 1228is( $hit->bits, 28.3 ); 1229is( $hit->raw_score, 63 ); 1230$hsp = $hit->next_hsp; 1231is( $hsp->query->start, 599 ); 1232is( $hsp->query->end, 655 ); 1233is( $hsp->hit->start, 23 ); 1234is( $hsp->hit->end, 76 ); 1235 1236# Test PSI-BLAST parsing 1237 1238$searchio = Bio::SearchIO->new( 1239 '-format' => 'blast', 1240 '-file' => test_input_file('psiblastreport.out') 1241); 1242 1243$result = $searchio->next_result; 1244like($result->algorithm_reference, qr/Gapped BLAST and PSI-BLAST/); 1245is( $result->database_name, '/home/peter/blast/data/swissprot.pr' ); 1246is( $result->database_entries, 88780 ); 1247is( $result->database_letters, 31984247 ); 1248 1249is( $result->algorithm, 'BLASTP' ); 1250like( $result->algorithm_version, qr/^2\.0\.14/ ); 1251is( $result->query_name, 'CYS1_DICDI' ); 1252is( $result->query_length, 343 ); 1253is( $result->get_statistic('kappa'), 0.0491 ); 1254cmp_ok( $result->get_statistic('lambda'), '==', 0.270 ); 1255cmp_ok( $result->get_statistic('entropy'), '==', 0.230 ); 1256is( $result->get_statistic('dbletters'), 31984247 ); 1257is( $result->get_statistic('dbentries'), 88780 ); 1258is( $result->get_statistic('effective_hsplength'), 49 ); 1259is( $result->get_statistic('querylength'), 294 ); 1260is( $result->get_statistic('effectivedblength'), 27634027 ); 1261is( $result->get_statistic('effectivespace'), 8124403938 ); 1262is( $result->get_statistic('effectivespaceused'), 8124403938 ); 1263is( $result->get_parameter('matrix'), 'BLOSUM62' ); 1264is( $result->get_parameter('gapopen'), 11 ); 1265is( $result->get_parameter('gapext'), 1 ); 1266 1267my @valid_hit_data = ( 1268 [ 'sp|P04988|CYS1_DICDI', 343, 'P04988', '0', 721 ], 1269 [ 'sp|P43295|A494_ARATH', 313, 'P43295', '1e-75', 281 ], 1270 [ 'sp|P25804|CYSP_PEA', 363, 'P25804', '1e-74', 278 ] 1271); 1272my @valid_iter_data = ( 1273 [ 127, 127, 0, 109, 18, 0, 0, 0, 0 ], 1274 [ 157, 40, 117, 2, 38, 0, 109, 3, 5 ] 1275); 1276my $iter_count = 0; 1277 1278while ( $iter = $result->next_iteration ) { 1279 $iter_count++; 1280 my $di = shift @valid_iter_data; 1281 is( $iter->number, $iter_count ); 1282 1283 is( $iter->num_hits, shift @$di ); 1284 is( $iter->num_hits_new, shift @$di ); 1285 is( $iter->num_hits_old, shift @$di ); 1286 is( scalar( $iter->newhits_below_threshold ), shift @$di ); 1287 is( scalar( $iter->newhits_not_below_threshold ), shift @$di ); 1288 is( scalar( $iter->newhits_unclassified ), shift @$di ); 1289 is( scalar( $iter->oldhits_below_threshold ), shift @$di ); 1290 is( scalar( $iter->oldhits_newly_below_threshold ), shift @$di ); 1291 is( scalar( $iter->oldhits_not_below_threshold ), shift @$di ); 1292 1293 my $hit_count = 0; 1294 if ( $iter_count == 1 ) { 1295 while ( $hit = $result->next_hit ) { 1296 my $d = shift @valid_hit_data; 1297 1298 is( $hit->name, shift @$d ); 1299 is( $hit->length, shift @$d ); 1300 is( $hit->accession, shift @$d ); 1301 float_is( $hit->significance, shift @$d ); 1302 is( $hit->bits, shift @$d ); 1303 1304 if ( $hit_count == 1 ) { 1305 my $hsps_left = 1; 1306 while ( my $hsp = $hit->next_hsp ) { 1307 is( $hsp->query->start, 32 ); 1308 is( $hsp->query->end, 340 ); 1309 is( $hsp->hit->start, 3 ); 1310 is( $hsp->hit->end, 307 ); 1311 is( $hsp->length('total'), 316 ); 1312 is( $hsp->start('hit'), $hsp->hit->start ); 1313 is( $hsp->end('query'), $hsp->query->end ); 1314 is( $hsp->strand('sbjct'), $hsp->subject->strand ) 1315 ; # alias for hit 1316 float_is( $hsp->evalue, 1e-75 ); 1317 is( $hsp->score, 712 ); 1318 is( $hsp->bits, 281 ); 1319 is( sprintf( "%.1f", $hsp->percent_identity ), 46.5 ); 1320 is( sprintf( "%.4f", $hsp->frac_identical('query') ), 1321 0.4757 ); 1322 is( sprintf( "%.3f", $hsp->frac_identical('hit') ), 0.482 ); 1323 is( $hsp->gaps, 18 ); 1324 is( $hsp->n, 1 ); 1325 $hsps_left--; 1326 } 1327 is( $hsps_left, 0 ); 1328 } 1329 last if ( $hit_count++ > @valid_hit_data ); 1330 } 1331 } 1332} 1333is( @valid_hit_data, 0 ); 1334is( @valid_iter_data, 0 ); 1335 1336# Test filtering 1337 1338$searchio = Bio::SearchIO->new( 1339 '-format' => 'blast', 1340 '-file' => test_input_file('ecolitst.bls'), 1341 '-signif' => 1e-100 1342); 1343 1344@valid = qw(gb|AAC73113.1|); 1345$r = $searchio->next_result; 1346 1347while ( my $hit = $r->next_hit ) { 1348 is( $hit->name, shift @valid ); 1349} 1350 1351$searchio = Bio::SearchIO->new( 1352 '-format' => 'blast', 1353 '-file' => test_input_file('ecolitst.bls'), 1354 '-score' => 100 1355); 1356 1357@valid = qw(gb|AAC73113.1| gb|AAC76922.1| gb|AAC76994.1|); 1358$r = $searchio->next_result; 1359 1360while ( my $hit = $r->next_hit ) { 1361 is( $hit->name, shift @valid ); 1362} 1363is( @valid, 0 ); 1364 1365$searchio = Bio::SearchIO->new( 1366 '-format' => 'blast', 1367 '-file' => test_input_file('ecolitst.bls'), 1368 '-bits' => 200 1369); 1370 1371@valid = qw(gb|AAC73113.1| gb|AAC76922.1|); 1372$r = $searchio->next_result; 1373 1374while ( my $hit = $r->next_hit ) { 1375 is( $hit->name, shift @valid ); 1376} 1377is( @valid, 0 ); 1378 1379my $filt_func = sub { 1380 my $hit = shift; 1381 $hit->frac_identical('query') >= 0.31; 1382}; 1383 1384$searchio = Bio::SearchIO->new( 1385 '-format' => 'blast', 1386 '-file' => test_input_file('ecolitst.bls'), 1387 '-hit_filter' => $filt_func 1388); 1389 1390@valid = qw(gb|AAC73113.1| gb|AAC76994.1|); 1391$r = $searchio->next_result; 1392 1393while ( my $hit = $r->next_hit ) { 1394 is( $hit->name, shift @valid ); 1395} 1396is( @valid, 0 ); 1397 1398# bl2seq parsing testing 1399 1400# this is blastp bl2seq 1401$searchio = Bio::SearchIO->new( 1402 -format => 'blast', 1403 -file => test_input_file('bl2seq.out') 1404); 1405$result = $searchio->next_result; 1406isa_ok( $result, 'Bio::Search::Result::ResultI' ); 1407is( $result->query_name, '' ); 1408is( $result->algorithm, 'BLASTP' ); 1409is( $result->algorithm_reference, undef ); 1410is( $result->get_statistic('querylength'), 320 ); 1411is( $result->get_statistic('effectivedblength'), 339 ); 1412is( $result->get_statistic('effectivespace'), 108480 ); 1413is( $result->get_statistic('effectivespaceused'), 108480 ); 1414$hit = $result->next_hit; 1415is( $hit->name, 'ALEU_HORVU' ); 1416is( $hit->length, 362 ); 1417$hsp = $hit->next_hsp; 1418is( $hsp->score, 481 ); 1419is( $hsp->bits, 191 ); 1420is( int $hsp->percent_identity, 34 ); 1421float_is( $hsp->evalue, 2e-53 ); 1422is( int( $hsp->frac_conserved * $hsp->length ), 167 ); 1423is( $hsp->query->start, 28 ); 1424is( $hsp->query->end, 343 ); 1425is( $hsp->hit->start, 60 ); 1426is( $hsp->hit->end, 360 ); 1427is( $hsp->gaps, 27 ); 1428 1429# this is blastn bl2seq 1430$searchio = Bio::SearchIO->new( 1431 -format => 'blast', 1432 -file => test_input_file('bl2seq.blastn.rev') 1433); 1434$result = $searchio->next_result; 1435isa_ok( $result, 'Bio::Search::Result::ResultI' ); 1436is( $result->query_name, '' ); 1437is( $result->algorithm, 'BLASTN' ); 1438is( $result->algorithm_reference, undef ); 1439is( $result->query_length, 180 ); 1440is( $result->get_statistic('querylength'), 174 ); 1441is( $result->get_statistic('effectivedblength'), 173 ); 1442is( $result->get_statistic('effectivespace'), 30102 ); 1443is( $result->get_statistic('effectivespaceused'), 30102 ); 1444$hit = $result->next_hit; 1445is( $hit->length, 179 ); 1446is( $hit->name, 'human' ); 1447$hsp = $hit->next_hsp; 1448is( $hsp->score, 27 ); 1449is( $hsp->bits, '54.0' ); 1450is( int $hsp->percent_identity, 88 ); 1451float_is( $hsp->evalue, 2e-12 ); 1452is( int( $hsp->frac_conserved * $hsp->length ), 83 ); 1453is( $hsp->query->start, 94 ); 1454is( $hsp->query->end, 180 ); 1455is( $hsp->query->strand, 1 ); 1456is( $hsp->hit->strand, -1 ); 1457is( $hsp->hit->start, 1 ); 1458is( $hsp->hit->end, 94 ); 1459is( $hsp->gaps, 7 ); 1460is( $hsp->n, 1 ); 1461 1462# this is blastn bl2seq 1463$searchio = Bio::SearchIO->new( 1464 -format => 'blast', 1465 -file => test_input_file('bl2seq.blastn') 1466); 1467$result = $searchio->next_result; 1468isa_ok( $result, 'Bio::Search::Result::ResultI' ); 1469is( $result->query_name, '' ); 1470is( $result->query_length, 180 ); 1471is( $result->algorithm, 'BLASTN' ); 1472is( $result->algorithm_reference, undef ); 1473is( $result->get_statistic('querylength'), 174 ); 1474is( $result->get_statistic('effectivedblength'), 173 ); 1475is( $result->get_statistic('effectivespace'), 30102 ); 1476is( $result->get_statistic('effectivespaceused'), 30102 ); 1477$hit = $result->next_hit; 1478is( $hit->name, 'human' ); 1479is( $hit->length, 179 ); 1480$hsp = $hit->next_hsp; 1481is( $hsp->score, 27 ); 1482is( $hsp->bits, '54.0' ); 1483is( int $hsp->percent_identity, 88 ); 1484float_is( $hsp->evalue, 2e-12 ); 1485is( int( $hsp->frac_conserved * $hsp->length ), 83 ); 1486is( $hsp->query->start, 94 ); 1487is( $hsp->query->end, 180 ); 1488is( $hsp->query->strand, 1 ); 1489is( $hsp->hit->strand, 1 ); 1490is( $hsp->hit->start, 86 ); 1491is( $hsp->hit->end, 179 ); 1492is( $hsp->gaps, 7 ); 1493is( $hsp->n, 1 ); 1494 1495# this is blastn bl2seq+ 1496$searchio = Bio::SearchIO->new( 1497 -format => 'blast', 1498 -file => test_input_file('bl2seq+.blastn') 1499); 1500$result = $searchio->next_result; 1501isa_ok( $result, 'Bio::Search::Result::ResultI' ); 1502is( $result->query_name, 'gi|2695846|emb|Y13255.1|' ); 1503is( $result->query_description, 1504 'Acipenser baeri mRNA for immunoglobulin heavy chain, clone ScH 3.3' 1505); 1506is( $result->query_length, 606 ); 1507is( $result->algorithm, 'BLASTN' ); 1508is( $result->algorithm_version, '2.2.29+' ); 1509is( $result->algorithm_reference, undef ); 1510is( $result->get_statistic('effectivespaceused'), 352836 ); 1511is( $result->get_statistic('kappa'), 0.621 ); 1512is( $result->get_statistic('kappa_gapped'), '0.460' ); 1513is( $result->get_statistic('lambda'), 1.33 ); 1514is( $result->get_statistic('lambda_gapped'), 1.28 ); 1515is( $result->get_statistic('entropy'), 1.12 ); 1516is( $result->get_statistic('entropy_gapped'), '0.850' ); 1517$hit = $result->next_hit; 1518is( $hit->name, 'gi|2695846|emb|Y13255.1|' ); 1519is( $hit->description, 1520 'Acipenser baeri mRNA for immunoglobulin heavy chain, clone ScH 3.3' 1521); 1522is( $hit->length, 606 ); 1523$hsp = $hit->next_hsp; 1524is( $hsp->score, 606 ); 1525is( $hsp->bits, 1120 ); 1526is( $hsp->percent_identity, 100 ); 1527float_is( $hsp->evalue, '0.0' ); 1528is( $hsp->query->start, 1 ); 1529is( $hsp->query->end, 606 ); 1530is( $hsp->query->strand, 1 ); 1531is( $hsp->hit->strand, 1 ); 1532is( $hsp->hit->start, 1 ); 1533is( $hsp->hit->end, 606 ); 1534is( $hsp->gaps, 0 ); 1535is( $hsp->n, 1 ); 1536 1537# this is blastp bl2seq 1538$searchio = Bio::SearchIO->new( 1539 -format => 'blast', 1540 -file => test_input_file('bl2seq.bug940.out') 1541); 1542$result = $searchio->next_result; 1543isa_ok( $result, 'Bio::Search::Result::ResultI' ); 1544is( $result->query_name, 'zinc' ); 1545is( $result->algorithm, 'BLASTP' ); 1546is( $result->query_description, 1547'finger protein 135 (clone pHZ-17) [Homo sapiens]. neo_id RS.ctg14243-000000.6.0' 1548); 1549is( $result->query_length, 469 ); 1550is( $result->get_statistic('querylength'), 446 ); 1551is( $result->get_statistic('effectivedblength'), 446 ); 1552is( $result->get_statistic('effectivespace'), 198916 ); 1553is( $result->get_statistic('effectivespaceused'), 198916 ); 1554$hit = $result->next_hit; 1555is( $hit->name, 'gi|4507985|' ); 1556is( $hit->ncbi_gi, 4507985 ); 1557is( $hit->description, 1558'zinc finger protein 135 (clone pHZ-17) [Homo sapiens]. neo_id RS.ctg14243-000000.6.0' 1559); 1560is( $hit->length, 469 ); 1561$hsp = $hit->next_hsp; 1562is( $hsp->score, 1626 ); 1563is( $hsp->bits, 637 ); 1564is( int $hsp->percent_identity, 66 ); 1565float_is( $hsp->evalue, 0.0 ); 1566is( int( $hsp->frac_conserved * $hsp->length ), 330 ); 1567is( $hsp->query->start, 121 ); 1568is( $hsp->query->end, 469 ); 1569is( $hsp->hit->start, 1 ); 1570is( $hsp->hit->end, 469 ); 1571is( $hsp->gaps, 120 ); 1572is( $hsp->n, 1 ); 1573 1574ok( $hit->next_hsp ); # there is more than one HSP here, 1575 # make sure it is parsed at least 1576 1577# cannot distinguish between blastx and tblastn reports 1578# so we're only testing a blastx report for now 1579 1580# this is blastx bl2seq 1581$searchio = Bio::SearchIO->new( 1582 -format => 'blast', 1583 -file => test_input_file('bl2seq.blastx.out') 1584); 1585$result = $searchio->next_result; 1586isa_ok( $result, 'Bio::Search::Result::ResultI' ); 1587is( $result->query_name, 'AE000111.1' ); 1588is( $result->query_description, 1589 'Escherichia coli K-12 MG1655 section 1 of 400 of the complete genome' ); 1590is( $result->algorithm, 'BLASTX' ); 1591is( $result->algorithm_reference, undef ); 1592is( $result->query_length, 720 ); 1593is( $result->get_statistic('querylength'), undef ); 1594is( $result->get_statistic('effectivedblength'), 787 ); 1595is( $result->get_statistic('effectivespace'), undef ); 1596is( $result->get_statistic('effectivespaceused'), 162122 ); 1597$hit = $result->next_hit; 1598is( $hit->name, 'AK1H_ECOLI' ); 1599is( $hit->description, 1600'P00561 Bifunctional aspartokinase/homoserine dehydrogenase I (AKI-HDI) [Includes: Aspartokinase I ; Homoserine dehydrogenase I ]' 1601); 1602is( $hit->length, 820 ); 1603$hsp = $hit->next_hsp; 1604is( $hsp->score, 634 ); 1605is( $hsp->bits, 248 ); 1606is( int $hsp->percent_identity, 100 ); 1607float_is( $hsp->evalue, 2e-70 ); 1608is( int( $hsp->frac_conserved * $hsp->length ), 128 ); 1609is( $hsp->query->start, 1 ); 1610is( $hsp->query->end, 384 ); 1611is( $hsp->hit->start, 1 ); 1612is( $hsp->hit->end, 128 ); 1613is( $hsp->gaps, 0 ); 1614is( $hsp->query->frame, 0 ); 1615is( $hsp->hit->frame, 0 ); 1616is( $hsp->query->strand, -1 ); 1617is( $hsp->hit->strand, 0 ); 1618is( $hsp->n, 1 ); 1619 1620# this is tblastx bl2seq (self against self) 1621$searchio = Bio::SearchIO->new( 1622 -format => 'blast', 1623 -file => test_input_file('bl2seq.tblastx.out') 1624); 1625$result = $searchio->next_result; 1626isa_ok( $result, 'Bio::Search::Result::ResultI' ); 1627is( $result->query_name, 'Escherichia' ); 1628is( $result->algorithm, 'TBLASTX' ); 1629is( $result->algorithm_reference, undef ); 1630is( $result->query_description, 1631 'coli K-12 MG1655 section 1 of 400 of the complete genome' ); 1632is( $result->query_length, 720 ); 1633is( $result->get_statistic('querylength'), undef ); 1634is( $result->get_statistic('effectivedblength'), 221 ); 1635is( $result->get_statistic('effectivespace'), undef ); 1636is( $result->get_statistic('effectivespaceused'), 48620 ); 1637$hit = $result->next_hit; 1638is( $hit->name, 'gi|1786181|gb|AE000111.1|AE000111' ); 1639is( $hit->ncbi_gi, 1786181 ); 1640is( $hit->description, 1641 'Escherichia coli K-12 MG1655 section 1 of 400 of the complete genome' ); 1642is( $hit->length, 720 ); 1643$hsp = $hit->next_hsp; 1644is( $hsp->score, 1118 ); 1645is( $hsp->bits, 515 ); 1646is( int $hsp->percent_identity, 95 ); 1647float_is( $hsp->evalue, 1e-151 ); 1648is( int( $hsp->frac_conserved * $hsp->length ), 229 ); 1649is( $hsp->query->start, 1 ); 1650is( $hsp->query->end, 720 ); 1651is( $hsp->hit->start, 1 ); 1652is( $hsp->hit->end, 720 ); 1653is( $hsp->gaps, 0 ); 1654is( $hsp->query->frame, 0 ); 1655is( $hsp->hit->frame, 0 ); 1656is( $hsp->query->strand, 1 ); 1657is( $hsp->hit->strand, 1 ); 1658is( $hsp->n, 1 ); 1659 1660# this is NCBI tblastn 1661$searchio = Bio::SearchIO->new( 1662 -format => 'blast', 1663 -file => test_input_file('tblastn.out') 1664); 1665$result = $searchio->next_result; 1666isa_ok( $result, 'Bio::Search::Result::ResultI' ); 1667is( $result->algorithm, 'TBLASTN' ); 1668like($result->algorithm_reference,qr/Gapped BLAST and PSI-BLAST/); 1669is( $result->get_statistic('querylength'), 102 ); 1670is( $result->get_statistic('effectivedblength'), 4342 ); 1671is( $result->get_statistic('effectivespace'), 442884 ); 1672is( $result->get_statistic('effectivespaceused'), 442884 ); 1673$hit = $result->next_hit; 1674is( $hit->name, 'gi|10040111|emb|AL390796.6|AL390796' ); 1675 1676# Test Blast parsing with B=0 (WU-BLAST) 1677$searchio = Bio::SearchIO->new( 1678 -file => test_input_file('no_hsps.blastp'), 1679 -format => 'blast' 1680); 1681$result = $searchio->next_result; 1682like($result->algorithm_reference,qr/Gish, W. \(1996-2003\)/); 1683is( $result->query_name, 'mgri:MG00189.3' ); 1684$hit = $result->next_hit; 1685is( $hit->name, 'mgri:MG00189.3' ); 1686is( $hit->description, 'hypothetical protein 6892 8867 +' ); 1687is( $hit->bits, 3098 ); 1688float_is( $hit->significance, 0. ); 1689 1690$hit = $result->next_hit; 1691is( $hit->name, 'fgram:FG01141.1' ); 1692is( $hit->description, 'hypothetical protein 47007 48803 -' ); 1693is( $hit->bits, 2182 ); 1694float_is( $hit->significance, 4.2e-226 ); 1695is( $result->num_hits, 415 ); 1696 1697# Let's now test if _guess_format is doing its job correctly 1698my %pair = ( 1699 'filename.blast' => 'blast', 1700 'filename.bls' => 'blast', 1701 'f.blx' => 'blast', 1702 'f.tblx' => 'blast', 1703 'fast.bls' => 'blast', 1704 'f.fasta' => 'fasta', 1705 'f.fa' => 'fasta', 1706 'f.fx' => 'fasta', 1707 'f.fy' => 'fasta', 1708 'f.ssearch' => 'fasta', 1709 'f.SSEARCH.m9' => 'fasta', 1710 'f.m9' => 'fasta', 1711 'f.psearch' => 'fasta', 1712 'f.osearch' => 'fasta', 1713 'f.exon' => 'exonerate', 1714 'f.exonerate' => 'exonerate', 1715 'f.blastxml' => 'blastxml', 1716 'f.xml' => 'blastxml' 1717); 1718while ( my ( $file, $expformat ) = each %pair ) { 1719 is( Bio::SearchIO->_guess_format($file), 1720 $expformat, "$expformat for $file" ); 1721} 1722 1723# Test Wes Barris's reported bug when parsing blastcl3 output which 1724# has integer overflow 1725 1726$searchio = Bio::SearchIO->new( 1727 -file => test_input_file('hsinsulin.blastcl3.blastn'), 1728 -format => 'blast' 1729); 1730$result = $searchio->next_result; 1731is( $result->query_name, 'human' ); 1732is( $result->database_letters(), '-24016349' ); 1733 1734# this is of course not the right length, but is the what blastcl3 1735# reports, the correct value is 1736is( $result->get_statistic('dbletters'), '192913178' ); 1737is( $result->get_statistic('dbentries'), '1867771' ); 1738 1739# test for links and groups being parsed out of WU-BLAST properly 1740$searchio = Bio::SearchIO->new( 1741 -format => 'blast', 1742 -file => test_input_file('brassica_ATH.WUBLASTN') 1743); 1744ok( $result = $searchio->next_result ); 1745ok( $hit = $result->next_hit ); 1746ok( $hsp = $hit->next_hsp ); 1747is( $hsp->links, '(1)-3-2' ); 1748is( $hsp->query->strand, 1 ); 1749is( $hsp->hit->strand, 1 ); 1750is( $hsp->hsp_group, '1' ); 1751is( $hsp->n, 1 ); 1752## Web blast result parsing 1753 1754$searchio = Bio::SearchIO->new( 1755 -format => 'blast', 1756 -file => test_input_file('catalase-webblast.BLASTP') 1757); 1758ok( $result = $searchio->next_result ); 1759is( $result->rid, '1118324516-16598-103707467515.BLASTQ1' ); 1760ok( $hit = $result->next_hit ); 1761is( $hit->name, 'gi|40747822|gb|EAA66978.1|', 'full hit name' ); 1762is( $hit->accession, 'EAA66978', 'hit accession' ); 1763is( $hit->ncbi_gi, 40747822 ); 1764ok( $hsp = $hit->next_hsp ); 1765is( $hsp->query->start, 1, 'query start' ); 1766is( $hsp->query->end, 528, 'query start' ); 1767is( $hsp->n, 1 ); 1768 1769# tests for new BLAST 2.2.13 output 1770$searchio = Bio::SearchIO->new( 1771 -format => 'blast', 1772 -file => test_input_file('new_blastn.txt') 1773); 1774 1775$result = $searchio->next_result; 1776is( $result->database_name, 1777'All GenBank+EMBL+DDBJ+PDB sequences (but no EST, STS,GSS,environmental samples or phase 0, 1 or 2 HTGS sequences)' 1778); 1779is( $result->database_entries, 3742891 ); 1780is( $result->database_letters, 16670205594 ); 1781is( $result->algorithm, 'BLASTN' ); 1782is( $result->algorithm_version, '2.2.13 [Nov-27-2005]' ); 1783like($result->algorithm_reference, qr/Gapped BLAST and PSI-BLAST/); 1784is( $result->rid, '1141079027-8324-8848328247.BLASTQ4' ); 1785is( $result->query_name, 'pyrR,' ); 1786is( $result->query_length, 558 ); 1787is( $result->get_statistic('kappa'), '0.711' ); 1788is( $result->get_statistic('kappa_gapped'), '0.711' ); 1789is( $result->get_statistic('lambda'), '1.37' ); 1790is( $result->get_statistic('lambda_gapped'), '1.37' ); 1791is( $result->get_statistic('entropy'), '1.31' ); 1792is( $result->get_statistic('entropy_gapped'), '1.31' ); 1793is( $result->get_statistic('dbletters'), '-509663586' ); 1794is( $result->get_statistic('dbentries'), 3742891 ); 1795is( $result->get_statistic('effective_hsplength'), undef ); 1796is( $result->get_statistic('effectivespace'), 8935230198384 ); 1797is( 1798 $result->get_statistic( 1799 'number_of_hsps_better_than_expect_value_cutoff_without_gapping'), 1800 0 1801); 1802is( $result->get_statistic('number_of_hsps_gapped'), 1771 ); 1803is( $result->get_statistic('number_of_hsps_successfully_gapped'), 0 ); 1804is( $result->get_statistic('length_adjustment'), 22 ); 1805is( $result->get_statistic('querylength'), 536 ); 1806is( $result->get_statistic('effectivedblength'), 16670205594 ); 1807is( $result->get_statistic('effectivespaceused'), 8891094027712 ); 1808is( $result->get_parameter('matrix'), 'blastn matrix:1 -3' ); 1809is( $result->get_parameter('gapopen'), 5 ); 1810is( $result->get_parameter('gapext'), 2 ); 1811is( $result->get_statistic('S2'), '60' ); 1812is( $result->get_statistic('S2_bits'), '119.4' ); 1813float_is( $result->get_parameter('expect'), '1e-23' ); 1814is( $result->get_statistic('num_extensions'), '117843' ); 1815 1816@valid = ( 1817 [ 1818 'gi|41400296|gb|AE016958.1|', 4829781, 'AE016958', 41400296, '6e-059', 1819 119, 236 1820 ], 1821 [ 1822 'gi|54013472|dbj|AP006618.1|', 6021225, 'AP006618', 54013472, '4e-026', 1823 64, 127 1824 ], 1825 [ 1826 'gi|57546753|dbj|BA000030.2|', 9025608, 'BA000030', 57546753, '1e-023', 1827 60, 119 1828 ] 1829); 1830$count = 0; 1831 1832while ( $hit = $result->next_hit ) { 1833 my $d = shift @valid; 1834 1835 is( $hit->name, shift @$d ); 1836 is( $hit->length, shift @$d ); 1837 is( $hit->accession, shift @$d ); 1838 is( $hit->ncbi_gi, shift @$d ); 1839 float_is( $hit->significance, shift @$d ); 1840 is( $hit->raw_score, shift @$d ); 1841 is( $hit->bits, shift @$d ); 1842 1843 if ( $count == 0 ) { 1844 my $hsps_left = 1; 1845 while ( my $hsp = $hit->next_hsp ) { 1846 is( $hsp->query->start, 262 ); 1847 is( $hsp->query->end, 552 ); 1848 is( $hsp->hit->start, 1166897 ); 1849 is( $hsp->hit->end, 1167187 ); 1850 is( $hsp->length('total'), 291 ); 1851 is( $hsp->hit_features, 'PyrR' ); 1852 is( $hsp->start('hit'), $hsp->hit->start ); 1853 is( $hsp->end('query'), $hsp->query->end ); 1854 is( $hsp->strand('sbjct'), $hsp->subject->strand ); # alias for hit 1855 float_is( $hsp->evalue, 6e-59 ); 1856 is( $hsp->score, 119 ); 1857 is( $hsp->bits, 236 ); 1858 is( sprintf( "%.2f", $hsp->percent_identity ), 85.22 ); 1859 is( sprintf( "%.4f", $hsp->frac_identical('query') ), 0.8522 ); 1860 is( sprintf( "%.4f", $hsp->frac_identical('hit') ), 0.8522 ); 1861 is( $hsp->gaps, 0 ); 1862 is( $hsp->n, 1 ); 1863 $hsps_left--; 1864 } 1865 is( $hsps_left, 0 ); 1866 } 1867 last if ( $count++ > @valid ); 1868} 1869is( @valid, 0 ); 1870 1871# Bug 2189 1872$searchio = Bio::SearchIO->new( 1873 -format => 'blast', 1874 -file => test_input_file('blastp2215.blast') 1875); 1876 1877$result = $searchio->next_result; 1878is( $result->database_entries, 4460989 ); 1879is( $result->database_letters, 1533424333 ); 1880is( $result->algorithm, 'BLASTP' ); 1881is( $result->algorithm_version, '2.2.15 [Oct-15-2006]' ); 1882is( $result->rid, '1169055516-21385-22799250964.BLASTQ4' ); 1883is( $result->query_name, 'gi|15608519|ref|NP_215895.1|' ); 1884is( $result->query_gi, 15608519 ); 1885is( $result->query_length, 193 ); 1886@hits = $result->hits; 1887is( scalar(@hits), 10 ); 1888is( $hits[1]->accession, '1W30' ); 1889is( $hits[4]->significance, '2e-72' ); 1890is( $hits[7]->bits, '254' ); 1891$result = $searchio->next_result; 1892is( $result->database_entries, 4460989 ); 1893is( $result->database_letters, 1533424333 ); 1894is( $result->algorithm, 'BLASTP' ); 1895is( $result->algorithm_version, '2.2.15 [Oct-15-2006]' ); 1896is( $result->query_name, 'gi|15595598|ref|NP_249092.1|' ); 1897is( $result->query_length, 423 ); 1898@hits = $result->hits; 1899is( scalar(@hits), 10 ); 1900is( $hits[1]->accession, 'ZP_00972546' ); 1901is( $hits[2]->ncbi_gi, 116054132 ); 1902is( $hits[4]->significance, '0.0' ); 1903is( $hits[7]->bits, 624 ); 1904 1905# Bug 2246 1906$searchio = Bio::SearchIO->new( 1907 -format => 'blast', 1908 -verbose => -1, 1909 -file => test_input_file('bug2246.blast') 1910); 1911$result = $searchio->next_result; 1912is( 1913 $result->get_statistic( 1914 'number_of_hsps_better_than_expect_value_cutoff_without_gapping'), 1915 0 1916); 1917is( $result->get_statistic('number_of_hsps_gapped'), 7049 ); 1918is( $result->get_statistic('number_of_hsps_successfully_gapped'), 55 ); 1919is( $result->get_statistic('length_adjustment'), 125 ); 1920is( $result->get_statistic('querylength'), 68 ); 1921is( $result->get_statistic('effectivedblength'), 1045382588 ); 1922is( $result->get_statistic('effectivespace'), 71086015984 ); 1923is( $result->get_statistic('effectivespaceused'), 71086015984 ); 1924$hit = $result->next_hit; 1925is $hit->name, 'UniRef50_Q9X0H5'; 1926is $hit->length, 0; 1927is $hit->accession, 'UniRef50_Q9X0H5'; 1928is $hit->description, 'Cluster: Histidyl-tRNA synthetase; n=4; Thermoto...'; 1929is $hit->bits, 23; 1930float_is( $hit->significance, 650 ); 1931 1932# Bug 1986 1933$searchio = Bio::SearchIO->new( 1934 -format => 'blast', 1935 -verbose => -1, 1936 -file => test_input_file('bug1986.blastp') 1937); 1938$result = $searchio->next_result; 1939is( $result->get_statistic('querylength'), 335 ); 1940is( $result->get_statistic('effectivedblength'), 18683311 ); 1941is( $result->get_statistic('effectivespace'), 6258909185 ); 1942is( $result->get_statistic('effectivespaceused'), 6258909185 ); 1943$hit = $result->next_hit; 1944is $hit->name, 'ENSP00000350182'; 1945is $hit->length, 425; 1946is $hit->accession, 'ENSP00000350182'; 1947is $hit->description, 1948'pep:novel clone::BX322644.8:4905:15090:-1 gene:ENSG00000137397 transcript:ENST00000357569'; 1949is $hit->raw_score, 301; 1950is $hit->bits, 120; 1951float_is( $hit->significance, 3e-27 ); 1952$hit = $result->next_hit; 1953is $hit->name, 'ENSP00000327738'; 1954is $hit->length, 468; 1955is $hit->accession, 'ENSP00000327738'; 1956is $hit->description, 1957'pep:known-ccds chromosome:NCBI36:4:189297592:189305643:1 gene:ENSG00000184108 transcript:ENST00000332517 CCDS3851.1'; 1958is $hit->raw_score, 289; 1959is $hit->bits, 115; 1960float_is( $hit->significance, 8e-26 ); 1961 1962# Bug 1986, pt. 2 1963 1964# handle at least the first iteration with BLAST searches using databases 1965# containing non-unique IDs 1966 1967my $file = test_input_file('bug1986.blast2'); 1968my %unique_accs; 1969open my $IN, '<', $file or die "Could not read file '$file': $!\n"; 1970 1971while (<$IN>) { 1972 last if (/^Sequences/); 1973} 1974$count = 1; 1975while (<$IN>) { 1976 chomp; 1977 next if m{^\s*$}; 1978 next unless ($_); 1979 last if m{^>}; 1980 my ($accession) = split(/\s+/); 1981 1982 #print "Real Hit $count = $accession\n"; 1983 $unique_accs{$accession}++; 1984 1985 #last if ($count == 10); 1986 ++$count; 1987} 1988close $IN; 1989 1990is( $count, 495 ); 1991is( scalar( keys %unique_accs ), 490 ); 1992 1993my %search_accs; 1994 1995$searchio = Bio::SearchIO->new( 1996 -format => 'blast', 1997 -verbose => -1, 1998 -file => $file 1999); 2000$result = $searchio->next_result; 2001$count = 1; 2002while ( my $hit = $result->next_hit ) { 2003 $search_accs{ $hit->accession }++; 2004 $count++; 2005} 2006 2007is( $count, 495 ); 2008is( scalar( keys %search_accs ), 490 ); 2009 2010is_deeply( \%unique_accs, \%search_accs ); 2011 2012# bug 2391 - long query names 2013 2014$file = test_input_file('bug2391.megablast'); 2015 2016$searchio = Bio::SearchIO->new( 2017 -format => 'blast', 2018 -file => $file 2019); 2020$result = $searchio->next_result; 2021 2022# data is getting munged up with long names 2023is( $result->query_name, 2024 'c6_COX;c6_QBL;6|31508172;31503325;31478402|rs36223351|1|dbSNP|C/G' ); 2025is( $result->query_description, '' ); 2026is( $result->algorithm, 'MEGABLAST' ); 2027is( 2028 $result->get_statistic( 2029 'number_of_hsps_better_than_expect_value_cutoff_without_gapping'), 2030 undef 2031); 2032is( $result->get_statistic('number_of_hsps_gapped'), 0 ); 2033is( $result->get_statistic('number_of_hsps_successfully_gapped'), 0 ); 2034is( $result->get_statistic('length_adjustment'), 16 ); 2035is( $result->get_statistic('querylength'), 85 ); 2036is( $result->get_statistic('effectivedblength'), 59358266 ); 2037is( $result->get_statistic('effectivespace'), 5045452610 ); 2038is( $result->get_statistic('effectivespaceused'), 5045452610 ); 2039 2040# bug 2399 - catching Expect(n) values 2041 2042$file = test_input_file('bug2399.tblastn'); 2043 2044$searchio = Bio::SearchIO->new( 2045 -format => 'blast', 2046 -file => $file 2047); 2048my $total_n = 0; 2049while ( my $query = $searchio->next_result ) { 2050 while ( my $subject = $query->next_hit ) { 2051 $total_n += grep { $_->n } $subject->hsps; 2052 } 2053} 2054is( $total_n, 80 ); # n = at least 1, so this was changed to reflect that 2055 2056sub cmp_evalue ($$) { 2057 my ( $tval, $aval ) = @_; 2058 is( sprintf( "%g", $tval ), sprintf( "%g", $aval ) ); 2059} 2060 2061# bug 3064 - All-gap Query/Subject lines for BLAST+ do not have numbering 2062 2063$file = test_input_file('blast_plus.blastp'); 2064 2065$searchio = Bio::SearchIO->new( 2066 -format => 'blast', 2067 -file => $file 2068); 2069 2070my $total_hsps = 0; 2071while ( my $query = $searchio->next_result ) { 2072 is( $query->get_statistic('querylength'), undef ); 2073 is( $query->get_statistic('effectivedblength'), undef ); 2074 is( $query->get_statistic('effectivespace'), undef ); 2075 is( $query->get_statistic('effectivespaceused'), 55770 ); 2076 while ( my $subject = $query->next_hit ) { 2077 while ( my $hsp = $subject->next_hsp ) { 2078 $total_hsps++; 2079 if ( $total_hsps == 1 ) { 2080 is( $hsp->start('query'), 5 ); 2081 is( $hsp->start('hit'), 3 ); 2082 is( $hsp->end('query'), 220 ); 2083 is( $hsp->end('hit'), 308 ); 2084 is( length( $hsp->query_string ), length( $hsp->hit_string ) ); 2085 } 2086 } 2087 } 2088} 2089 2090is( $total_hsps, 2 ); 2091 2092# BLAST 2.2.20+ output file ZABJ4EA7014.CH878695.1.blast.txt 2093# Tests SearchIO blast parsing of 'Features in/flanking this part of a subject sequence' 2094$searchio = Bio::SearchIO->new( 2095 -format => 'blast', 2096 -file => test_input_file('ZABJ4EA7014.CH878695.1.blast.txt') 2097); 2098 2099$result = $searchio->next_result; 2100 2101# Parse BLAST header details 2102is( $result->algorithm, 'BLASTN' ); 2103is( $result->algorithm_version, '2.2.20+' ); 2104like($result->algorithm_reference, qr/A greedy algorithm for aligning DNA\s+sequences/); 2105is( $result->database_name, 2106 'human build 35 genome database (reference assembly only)' ); 2107is( $result->database_entries, 378 ); 2108is( $result->database_letters, 2866055344 ); 2109is( $result->query_name, 'gi|95131563|gb|CH878695.1|' ); 2110is( $result->query_description, 2111 'Homo sapiens 211000035829648 genomic scaffold' ); 2112is( $result->query_length, 29324 ); 2113 2114# Parse BLAST footer details 2115is( $result->get_statistic('posted_date'), 'Jul 26, 2007 3:20 PM' ); 2116is( $result->get_statistic('dbletters'), -1428911948 ); 2117is( $result->get_statistic('lambda'), '1.33' ); 2118is( $result->get_statistic('kappa'), '0.621' ); 2119is( $result->get_statistic('entropy'), '1.12' ); 2120is( $result->get_statistic('lambda_gapped'), '1.28' ); 2121is( $result->get_statistic('kappa_gapped'), '0.460' ); 2122is( $result->get_statistic('entropy_gapped'), '0.850' ); 2123is( $result->get_parameter('matrix'), 'blastn matrix:1 -2' ); 2124is( $result->get_parameter('gapopen'), 0 ); 2125is( $result->get_parameter('gapext'), 0 ); 2126is( $result->get_statistic('num_extensions'), 216 ); 2127is( $result->get_statistic('num_successful_extensions'), 216 ); 2128is( $result->get_parameter('expect'), '0.01' ); 2129is( $result->get_statistic('seqs_better_than_cutoff'), 10 ); 2130is( 2131 $result->get_statistic( 2132 'number_of_hsps_better_than_expect_value_cutoff_without_gapping'), 2133 0 2134); 2135is( $result->get_statistic('number_of_hsps_gapped'), 216 ); 2136is( $result->get_statistic('number_of_hsps_successfully_gapped'), 212 ); 2137is( $result->get_statistic('length_adjustment'), 34 ); 2138is( $result->get_statistic('querylength'), 29290 ); 2139is( $result->get_statistic('effectivedblength'), 2866042492 ); 2140is( $result->get_statistic('effectivespace'), 83946384590680 ); 2141is( $result->get_statistic('effectivespaceused'), 83946384590680 ); 2142is( $result->get_statistic('A'), 0 ); 2143is( $result->get_statistic('X1'), 23 ); 2144is( $result->get_statistic('X1_bits'), '44.2' ); 2145is( $result->get_statistic('X2'), 32 ); 2146is( $result->get_statistic('X2_bits'), '59.1' ); 2147is( $result->get_statistic('X3'), 54 ); 2148is( $result->get_statistic('X3_bits'), '99.7' ); 2149is( $result->get_statistic('S1'), 23 ); 2150is( $result->get_statistic('S1_bits'), '43.6' ); 2151is( $result->get_statistic('S2'), 29 ); 2152is( $result->get_statistic('S2_bits'), '54.7' ); 2153 2154# Skip the 1st hit. It doesn't have any 'Features in/flanking this part of subject sequence:' 2155$hit = $result->next_hit; 2156 2157# The 2nd hit has hsps with 'Features flanking this part of subject sequence:' 2158$hit = $result->next_hit; 2159is( $hit->name, 'gi|51459264|ref|NT_077382.3|Hs1_77431' ); 2160is( $hit->description, 'Homo sapiens chromosome 1 genomic contig' ); 2161is( $hit->length, 237250 ); 2162 2163# In the 2nd hit, look at the 1st hsp 2164$hsp = $hit->next_hsp; 2165is( $hsp->hit_features, 2166"16338 bp at 5' side: PRAME family member 8 11926 bp at 3' side: PRAME family member 9" 2167); 2168is( $hsp->bits, 7286 ); 2169is( $hsp->score, 3945 ); 2170is( $hsp->expect, '0.0' ); 2171is( $hsp->hsp_length, 6145 ); 2172is( $hsp->num_identical, 5437 ); 2173is( int sprintf( "%.2f", $hsp->percent_identity ), 88 ); 2174is( $hsp->gaps, 152 ); 2175is( $hsp->start('query'), 23225 ); 2176is( $hsp->start('sbjct'), 86128 ); 2177is( $hsp->end('query'), 29324 ); 2178is( $hsp->end('sbjct'), 92165 ); 2179 2180# In the 2nd hit, look at the 2nd hsp 2181$hsp = $hit->next_hsp; 2182is( $hsp->hit_features, 2183"25773 bp at 5' side: PRAME family member 3 3198 bp at 3' side: PRAME family member 5" 2184); 2185is( $hsp->bits, 4732 ); 2186is( $hsp->score, 2562 ); 2187is( $hsp->expect, '0.0' ); 2188is( $hsp->hsp_length, 4367 ); 2189is( $hsp->num_identical, 3795 ); 2190is( int sprintf( "%.2f", $hsp->percent_identity ), 86 ); 2191is( $hsp->gaps, 178 ); 2192is( $hsp->start('query'), 23894 ); 2193is( $hsp->start('sbjct'), 37526 ); 2194is( $hsp->end('query'), 28193 ); 2195is( $hsp->end('sbjct'), 41781 ); 2196 2197# In the 2nd hit, look at the 3rd hsp 2198$hsp = $hit->next_hsp; 2199is( $hsp->hit_features, 2200"16338 bp at 5' side: PRAME family member 8 14600 bp at 3' side: PRAME family member 9" 2201); 2202is( $hsp->bits, 3825 ); 2203is( $hsp->score, 2071 ); 2204is( $hsp->expect, '0.0' ); 2205is( $hsp->hsp_length, 3406 ); 2206is( $hsp->num_identical, 2976 ); 2207is( int sprintf( "%.2f", $hsp->percent_identity ), 87 ); 2208is( $hsp->gaps, 89 ); 2209is( $hsp->start('query'), 14528 ); 2210is( $hsp->start('sbjct'), 86128 ); 2211is( $hsp->end('query'), 17886 ); 2212is( $hsp->end('sbjct'), 89491 ); 2213 2214# In the 2nd hit, look at the 4th hsp 2215$hsp = $hit->next_hsp; 2216is( $hsp->hit_features, 2217"29101 bp at 5' side: PRAME family member 8 2120 bp at 3' side: PRAME family member 9" 2218); 2219is( $hsp->bits, 3241 ); 2220is( $hsp->score, 1755 ); 2221is( $hsp->expect, '0.0' ); 2222is( $hsp->hsp_length, 3158 ); 2223is( $hsp->num_identical, 2711 ); 2224is( int sprintf( "%.2f", $hsp->percent_identity ), 85 ); 2225is( $hsp->gaps, 123 ); 2226is( $hsp->start('query'), 23894 ); 2227is( $hsp->start('sbjct'), 98891 ); 2228is( $hsp->end('query'), 27005 ); 2229is( $hsp->end('sbjct'), 101971 ); 2230 2231# In the 2nd hit, look at the 5th hsp 2232$hsp = $hit->next_hsp; 2233is( $hsp->hit_features, "PRAME family member 13" ); 2234is( $hsp->bits, 3142 ); 2235is( $hsp->score, 1701 ); 2236is( $hsp->expect, '0.0' ); 2237is( $hsp->hsp_length, 2507 ); 2238is( $hsp->num_identical, 2249 ); 2239is( int sprintf( "%.2f", $hsp->percent_identity ), 89 ); 2240is( $hsp->gaps, 63 ); 2241is( $hsp->start('query'), 3255 ); 2242is( $hsp->start('sbjct'), 128516 ); 2243is( $hsp->end('query'), 5720 ); 2244is( $hsp->end('sbjct'), 131000 ); 2245 2246# testing for Bug #3298 2247$searchio = Bio::SearchIO->new( 2248 '-format' => 'blast', 2249 '-file' => test_input_file('multiresult_blastn+.bls') 2250); 2251 2252is ($searchio->next_result->algorithm_version, '2.2.25+', "testing Bug 3298"); 2253is ($searchio->next_result->algorithm_version, '2.2.25+', "testing Bug 3298"); 2254is ($searchio->next_result->algorithm_version, '2.2.25+', "testing Bug 3298"); 2255 2256# testing for Bug #3251 2257$searchio = Bio::SearchIO->new( 2258 '-format' => 'blast', 2259 '-file' => test_input_file('rpsblast_no_hits.bls') 2260); 2261 2262is ($searchio->next_result->database_name, 'CDD.v.2.13', "testing Bug 3251"); 2263is ($searchio->next_result->database_name, 'CDD.v.2.13', "testing Bug 3251"); 2264is ($searchio->next_result->database_name, 'CDD.v.2.13', "testing Bug 3251"); 2265