1# -*-Perl-*- Test Harness script for Bioperl 2# $Id$ 3 4use strict; 5 6BEGIN { 7 use Bio::Root::Test; 8 9 test_begin(-tests => 561, 10 -requires_module => 'Data::Stag'); 11 12 use_ok('Bio::SeqIO'); 13} 14 15 16my $verbose = test_debug(); 17 18################################## GenBank ################################## 19 20my $ast = Bio::SeqIO->new(-format => 'gbdriver' , 21 -verbose => $verbose, 22 -file => test_input_file("roa1.genbank")); 23$ast->verbose($verbose); 24my $as = $ast->next_seq(); 25is $as->molecule, 'mRNA',$as->accession_number; 26is $as->alphabet, 'dna'; 27is($as->primary_id, 3598416); 28my @class = $as->species->classification; 29is $class[$#class],'Eukaryota'; 30 31$ast = Bio::SeqIO->new(-format => 'gbdriver', 32 -verbose => $verbose, 33 -file => test_input_file("NT_021877.gbk")); 34$ast->verbose($verbose); 35$as = $ast->next_seq(); 36is $as->molecule, 'DNA',$as->accession_number; 37is $as->alphabet, 'dna'; 38is($as->primary_id, 37539616); 39is($as->accession_number, 'NT_021877'); 40 41my ($cds) = grep { $_->primary_tag eq 'CDS' } $as->get_SeqFeatures(); 42is(($cds->get_tag_values('transl_except'))[1], 43 '(pos:complement(4224..4226),aa:OTHER)'); 44 45# test for a DBSOURCE line 46$ast = Bio::SeqIO->new(-format => 'gbdriver', 47 -verbose => $verbose, 48 -file => test_input_file("BAB68554.gb")); 49$ast->verbose($verbose); 50$as = $ast->next_seq(); 51is $as->molecule, 'linear',$as->accession_number;; 52is $as->alphabet, 'protein'; 53# Though older GenBank releases indicate SOURCE contains only the common name, 54# this is no longer true. In general, this line will contain an abbreviated 55# form of the full organism name (but may contain the full length name), 56# as well as the optional common name and organelle. There is no get/set 57# for the abbreviated name but it is accessible via name() 58ok defined($as->species->name('abbreviated')->[0]); 59is $as->species->name('abbreviated')->[0], 'Aldabra giant tortoise'; 60is($as->primary_id, 15824047); 61my $ac = $as->annotation; 62ok defined $ac; 63my @dblinks = $ac->get_Annotations('dblink'); 64is(scalar @dblinks,1); 65is($dblinks[0]->database, 'GenBank'); 66is($dblinks[0]->primary_id, 'AB072353'); 67is($dblinks[0]->version, '1'); 68is($dblinks[0]->display_text, 'GenBank:AB072353.1','operator overloading in AnnotationI is deprecated'); 69 70# test for multi-line SOURCE 71$ast = Bio::SeqIO->new(-format => 'gbdriver', 72 -verbose => $verbose, 73 -file => test_input_file("NC_006346.gb")); 74$as = $ast->next_seq; 75is $as->species->binomial('FULL'), 'Bolitoglossa n. sp. RLM-2004',$as->accession_number;; 76@class = $as->species->classification; 77is($class[$#class],'Eukaryota'); 78is($as->species->common_name,'mushroomtongue salamander'); 79 80$ast = Bio::SeqIO->new(-format => 'gbdriver', 81 -verbose => $verbose, 82 -file => test_input_file("U71225.gb")); 83$as = $ast->next_seq; 84@class = $as->species->classification; 85is($class[$#class],'Eukaryota',$as->accession_number); 86is $as->species->common_name,'black-bellied salamander'; 87 88# test for unusual common name 89$ast = Bio::SeqIO->new(-format => 'gbdriver', 90 -verbose => $verbose, 91 -file => test_input_file("AB077698.gb")); 92$as = $ast->next_seq; 93# again, this is not a common name but is in name('abbreviated') 94ok defined($as->species->name('abbreviated')->[0]),$as->accession_number; 95is $as->species->name('abbreviated')->[0],'Homo sapiens cDNA to mRNA'; 96 97# test for common name with parentheses 98$ast = Bio::SeqIO->new(-format => 'gbdriver', 99 -verbose => $verbose, 100 -file => test_input_file("DQ018368.gb")); 101$as = $ast->next_seq; 102is $as->species->scientific_name,'(Populus tomentosa x P. bolleana) x P. tomentosa var. truncata', 103$as->accession_number;; 104 105# test secondary accessions 106my $seqio = Bio::SeqIO->new(-format => 'gbdriver', 107 -verbose => $verbose, 108 -file => test_input_file('D10483.gbk')); 109my $seq = $seqio->next_seq; 110my @kw = $seq->get_keywords; 111is(scalar @kw, 118, $seq->accession_number); 112is($kw[-1], 'yabO'); 113my @sec_acc = $seq->get_secondary_accessions(); 114is(scalar @sec_acc,14); 115is($sec_acc[-1], 'X56742'); 116 117# bug #1487 118my $str = Bio::SeqIO->new(-verbose => $verbose, 119 -file => test_input_file('D12555.gbk')); 120eval { 121 $seq = $str->next_seq; 122}; 123 124ok(! $@, 'bug 1487'); 125 126# bug 1647 rpt_unit sub-feature with multiple parens 127$str = Bio::SeqIO->new(-format => 'gbdriver', 128 -verbose => $verbose, 129 -file => test_input_file('mini-AE001405.gb')); 130ok($seq = $str->next_seq); 131my @rpts = grep { $_->primary_tag eq 'repeat_region' } 132 $seq->get_SeqFeatures; 133is $#rpts, 2, 'bug 1647'; 134my @rpt_units = grep {$_->has_tag('rpt_unit')} @rpts; 135is $#rpt_units, 0; 136is(($rpt_units[0]->get_tag_values('rpt_unit'))[0],'(TG)10;A;(TG)7'); 137 138# test bug #1673 , RDB-II genbank files 139$str = Bio::SeqIO->new(-format => 'gbdriver', 140 -verbose => $verbose, 141 -file => test_input_file('Mcjanrna_rdbII.gbk') 142 ); 143ok($seq = $str->next_seq, 'bug 1673'); 144my @refs = $seq->annotation->get_Annotations('reference'); 145is(@refs, 1); 146is($seq->display_id,'Mc.janrrnA'); 147is($seq->molecule ,'RNA'); 148 149$str = Bio::SeqIO->new(-format => 'gbdriver', 150 -file => test_input_file("AF165282.gb"), 151 -verbose => $verbose); 152$seq = $str->next_seq; 153my @features = $seq->all_SeqFeatures(); 154is(@features, 5, $seq->accession_number); 155is($features[0]->start, 1); 156is($features[0]->end, 226); 157my $location = $features[1]->location; 158ok($location->isa('Bio::Location::SplitLocationI')); 159my @sublocs = $location->sub_Location(); 160is(@sublocs, 29); 161 162# version and primary ID - believe it or not, this wasn't working 163is ($seq->version, 1); 164is ($seq->seq_version, 1); 165is ($seq->primary_id, "5734104"); 166 167# streaming and Bio::RichSeq creation 168my $stream = Bio::SeqIO->new(-file => test_input_file("test.genbank"), 169 -verbose => $verbose, 170 -format => 'gbdriver'); 171$stream->verbose($verbose); 172my $seqnum = 0; 173my $species; 174my @cl; 175my $lasts; 176my @ids = qw(DDU63596 DDU63595 HUMBDNF); 177my @tids = (44689, 44689, 9606); 178my @tnames = ("Dictyostelium discoideum","Dictyostelium discoideum", 179 "Homo sapiens"); 180while($seq = $stream->next_seq()) { 181 if($seqnum < 3) { 182 is $seq->display_id(), $ids[$seqnum]; 183 $species = $seq->species(); 184 @cl = $species->classification(); 185 is( $species->binomial(), $tnames[$seqnum], 186 'species parsing incorrect for genbank'); 187 is( $cl[3] ne $species->genus(), 1, 188 'genus duplicated in genbank parsing'); 189 is( $species->ncbi_taxid, $tids[$seqnum] ); 190 } 191 $seqnum++; 192 $lasts = $seq; 193} 194is($seqnum, 5,'streaming'); 195is $lasts->display_id(), "HUMBETGLOA"; 196my ($ref) = $lasts->annotation->get_Annotations('reference'); 197is($ref->medline, 94173918); 198$stream->close(); 199 200$stream = Bio::SeqIO->new(-file => test_input_file("test.genbank.noseq"), 201 -verbose => $verbose, 202 -format => 'gbdriver' ); 203$seqnum = 0; 204while($seq = $stream->next_seq()) { 205 if($seqnum < 3) { 206 is $seq->display_id(), $ids[$seqnum]; 207 } elsif( $seq->display_id eq 'M37762') { 208 is( ($seq->get_keywords())[0], 'neurotrophic factor'); 209 } 210 $seqnum++; 211} 212is $seqnum, 5, "Total number of sequences in test file"; 213 214# fuzzy 215$seq = Bio::SeqIO->new( -format => 'gbdriver', 216 -verbose => $verbose, 217 -file =>test_input_file("testfuzzy.genbank")); 218$seq->verbose($verbose); 219ok(defined($as = $seq->next_seq())); 220 221@features = $as->all_SeqFeatures(); 222is(@features,21,'Fuzzy in'); 223my $lastfeature = pop @features; 224# this is a split location; the root doesn't have strand 225is($lastfeature->strand, undef); 226$location = $lastfeature->location; 227#$location->verbose(-1); # silence the warning of undef seq_id() 228# see above; splitlocs roots do not have a strand really 229is($location->strand, undef); 230is($location->start, 83202); 231is($location->end, 84996); 232 233@sublocs = $location->sub_Location(); 234 235is(@sublocs, 2); 236my $loc = shift @sublocs; 237is($loc->start, 83202); 238is($loc->end, 83329); 239is($loc->strand, -1); 240 241$loc = shift @sublocs; 242is($loc->start, 84248); 243is($loc->end, 84996); 244is($loc->strand,1); 245 246my $outfile = test_output_file(); 247$seq = Bio::SeqIO->new(-format => 'genbank', 248 -verbose => $verbose, 249 -file=> ">$outfile"); 250$seq->verbose($verbose); 251ok($seq->write_seq($as),'Fuzzy out'); 252 253## now genbank ## 254$str = Bio::SeqIO->new(-format =>'gbdriver', 255 -verbose => $verbose, 256 -file => test_input_file('BK000016-tpa.gbk')); 257$seq = $str->next_seq; 258ok(defined $seq, $seq->accession_number); 259ok(defined $seq->seq); 260is($seq->accession_number, 'BK000016',$seq->accession_number); 261is($seq->alphabet, 'dna'); 262is($seq->display_id, 'BK000016'); 263is($seq->length, 1162); 264is($seq->division, 'ROD'); 265is($seq->get_dates, 1); 266is($seq->keywords, 'Third Party Annotation; TPA'); 267is($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.'); 268is($seq->seq_version, 1); 269is($seq->feature_count, 2); 270my $spec_obj = $seq->species; 271is ($spec_obj->common_name, 'house mouse'); 272is ($spec_obj->species, 'musculus'); 273is ($spec_obj->genus, 'Mus'); 274is ($spec_obj->binomial, 'Mus musculus'); 275$ac = $seq->annotation; 276my $reference = ($ac->get_Annotations('reference') )[0]; 277is ($reference->pubmed, '11479594'); 278is ($reference->medline, '21372465',$seq->accession_number); 279 280# validate that what is written is what is read 281my $testfile = test_output_file(); 282my $out = Bio::SeqIO->new(-file => ">$testfile", 283 -format => 'genbank'); 284$out->write_seq($seq); 285$out->close(); 286 287$str = Bio::SeqIO->new(-format =>'gbdriver', 288 -file => $testfile); 289$seq = $str->next_seq; 290ok(defined $seq,'roundtrip'); 291ok(defined $seq->seq); 292is($seq->accession_number, 'BK000016'); 293is($seq->alphabet, 'dna'); 294is($seq->display_id, 'BK000016'); 295is($seq->length, 1162); 296is($seq->division, 'ROD'); 297is($seq->get_dates, 1); 298is($seq->keywords, 'Third Party Annotation; TPA'); 299is($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.'); 300is($seq->seq_version, 1); 301is($seq->feature_count, 2); 302$spec_obj = $seq->species; 303is ($spec_obj->common_name, 'house mouse'); 304is ($spec_obj->species, 'musculus'); 305is ($spec_obj->genus, 'Mus'); 306is ($spec_obj->binomial, 'Mus musculus'); 307$ac = $seq->annotation; 308$reference = ($ac->get_Annotations('reference') )[0]; 309is ($reference->pubmed, '11479594'); 310is ($reference->medline, '21372465'); 311 312# write revcomp split location 313my $gb = Bio::SeqIO->new(-format => 'gbdriver', 314 -verbose => $verbose, 315 -file => test_input_file('revcomp_mrna.gb')); 316$seq = $gb->next_seq(); 317 318$gb = Bio::SeqIO->new(-format => 'genbank', 319 -file => ">$testfile"); 320 321$gb->write_seq($seq); 322undef $gb; 323ok(! -z $testfile, 'revcomp split location'); 324 325# bug 1925, continuation of long ORGANISM line ends up in @classification: 326# ORGANISM Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 327# 9150 328# Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales; 329# Enterobacteriaceae; Salmonella. 330$gb = Bio::SeqIO->new(-format => 'gbdriver', 331 -verbose => $verbose, 332 -file => test_input_file('NC_006511-short.gbk')); 333$seq = $gb->next_seq; 334is $seq->species->common_name, undef, "Bug 1925"; 335is $seq->species->scientific_name, "Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150"; 336@class = $seq->species->classification; 337is $class[$#class], "Bacteria"; 338 339# WGS tests 340$gb = Bio::SeqIO->new(-format => 'gbdriver', 341 -verbose => $verbose, 342 -file => test_input_file('O_sat.wgs')); 343$seq = $gb->next_seq; 344 345my @tests = ('wgs' => 'AAAA02000001-AAAA02050231', 346 'wgs_scafld' => 'CM000126-CM000137', 347 'wgs_scafld' => 'CH398081-CH401163'); 348 349my @wgs = map {$seq->annotation->get_Annotations(lc($_))} (qw(WGS WGS_SCAFLD)); 350 351my $ct=0; 352 353for my $wgs (@wgs) { 354 my ($tagname, $value) = (shift @tests, shift @tests); 355 is($wgs->tagname, $tagname, $tagname); 356 is($wgs->value, $value); 357 $ct++; 358} 359 360is ($ct, 3); 361 362# make sure we can retrieve a feature with a primary tag of 'misc_difference' 363$gb = Bio::SeqIO->new(-format => 'gbdriver', 364 -verbose => $verbose, 365 -file => test_input_file('BC000007.gbk')); 366$seq = $gb->next_seq; 367($cds) = grep { $_->primary_tag eq 'misc_difference' } $seq->get_SeqFeatures; 368my @vals = $cds->get_tag_values('gene'); 369is $vals[0], 'PX19', $seq->accession_number; 370 371# Check that the source,organism section is identical between input and output. 372# - test an easy one where organism is species, then two different formats of 373# subspecies, then a species with a format that used to be mistaken for 374# subspecies, then a bacteria with no genus, and finally a virus with a genus. 375 376# These tests are now somewhat out-of-date since we are moving to a Bio::Taxon- 377# based system for verifying taxonomic information. Right now they just verify 378# changes so are really useless; I will change them to verify common name, 379# organelle, scientific name, etc. 380 381 382# output always adds a period (GenBank std), but two of these files do not use them. 383 384foreach my $in ('BK000016-tpa.gbk', 'ay116458.gb', 'ay149291.gb', 'NC_006346.gb', 'ay007676.gb', 'dq519393.gb') { 385 my $infile = test_input_file($in); 386 $outfile = test_output_file(); 387 388 $str = Bio::SeqIO->new(-format =>'genbank', 389 -verbose => $verbose, 390 -file => $infile); 391 $seq = $str->next_seq; 392 393 $out = Bio::SeqIO->new(-file => $outfile, -format => 'genbank'); 394 $out->write_seq($seq); 395 $out->close(); 396 397 open my $IN, '<', $infile or die "Could not read file '$infile': $!\n"; 398 my @in = <$IN>; 399 close $IN; 400 open my $RESULT, '<', $outfile or die "Could not read file '$outfile': $!\n"; 401 my $line = 0; 402 my $check = 0; 403 my $is = 1; 404 405 FILECHECK: 406 while (my $result = <$RESULT>) { 407 if ($result =~ /^KEYWORDS/) { 408 $check = 1; 409 next; 410 } 411 412 if ($result =~ /^REFERENCE/) { 413 last FILECHECK; 414 } 415 416 if ($check) { 417 418 # end periods don't count (not all input files have them) 419 $result =~ s{\.$}{}; 420 $in[$line] =~ s{\.$}{}; 421 422 if ($result ne $in[$line]) { 423 $is = 0; 424 last; 425 } 426 } 427 } continue { $line++ } 428 close $RESULT; 429 430 ok $is, $in; 431} 432 433# NB: there should probably be full testing on all lines to ensure that output 434# matches input. 435 436# 20061117: problem with *double* colon in some annotation-dblink values 437$ct = 0; 438 439foreach my $in ('P35527.gb') { 440 my $infile = test_input_file($in); 441 $str = Bio::SeqIO->new(-format =>'genbank', 442 -verbose => $verbose, 443 -file => $infile); 444 $seq = $str->next_seq; 445 my $ac = $seq->annotation(); # Bio::AnnotationCollection 446 foreach my $key ($ac->get_all_annotation_keys() ) { 447 my @values = $ac->get_Annotations($key); 448 foreach my $ann (@values) { 449 my $value = $ann->display_text; 450 $ct++; 451 if ($key eq 'dblink') { 452 453 ok (index($value,'::') < 0); # this should never be true 454 455 ok ($value, $value); # check value is not empty 456 457 # print " ann/", sprintf('%12s ',$key), '>>>', $value , '<<<', "\n"; 458 # print " index double colon: ",index($value ,'::'), "\n"; 459 460 # check db name: 461 my @parts = split(/:/,$value); 462 if ( $parts[0] =~ /^(?: 463 # not an exhaustive list of databases; 464 # just the db's referenced in P35527.gb: 465 swissprot | GenBank | GenPept | HSSP| IntAct | Ensembl | KEGG | HGNC | MIM | ArrayExpress 466 | GO | InterPro | Pfam| PRINTS | PROSITE 467 )$/x ) 468 { 469 ok 1; 470 } 471 else { 472 ok 0; 473 } 474 ok ( $parts[1], "$parts[0]" ); 475 } 476 # elsif ($key eq 'reference') { } 477 } 478 } 479} 480 481is($ct, 46); 482 483# bug 2195 484 485$str = Bio::SeqIO->new(-format =>'gbdriver', 486 -verbose => $verbose, 487 -file => test_input_file('AF305198.gb') 488 ); 489 490$species = $str->next_seq->species; 491 492is($species->scientific_name, 'Virginia creeper phytoplasma', 'Bug 2195'); 493is(join(', ',$species->classification), 'Virginia creeper phytoplasma, '. 494 '16SrV (Elm yellows group), Candidatus Phytoplasma, '. 495 'Acholeplasmataceae, Acholeplasmatales, Mollicutes, '. 496 'Firmicutes, Bacteria', 'Bug 2195'); 497 498# bug 2569, PROJECT line support, read and write, round-tripping 499 500$str = Bio::SeqIO->new(-format =>'gbdriver', 501 -verbose => $verbose, 502 -file => test_input_file('NC_008536.gb')); 503 504$seq = $str->next_seq; 505 506my $project = ($seq->annotation->get_Annotations('project'))[0]; 507 508isa_ok($project, 'Bio::Annotation::SimpleValue'); 509 510if ($project) { 511 is($project->value, 'GenomeProject:12638'); 512} else { 513 ok(0, "PROJECT not parsed"); 514} 515 516$outfile = test_output_file(); 517 518$gb = Bio::SeqIO->new(-format => 'genbank', 519 -verbose => $verbose, 520 -file=> ">$outfile"); 521 522$gb->write_seq($seq); 523 524$str = Bio::SeqIO->new(-format =>'gbdriver', 525 -verbose => $verbose, 526 -file => $outfile); 527 528$seq = $str->next_seq; 529 530$project = ($seq->annotation->get_Annotations('project'))[0]; 531 532isa_ok($project, 'Bio::Annotation::SimpleValue'); 533 534if ($project) { 535 is($project->value, 'GenomeProject:12638'); 536} else { 537 ok(0, "Roundtrip test failed"); 538} 539 540################################## EMBL ################################## 541 542# Set to -1 for release version, so warnings aren't printed 543 544$ast = Bio::SeqIO->new( -format => 'embldriver', 545 -verbose => $verbose, 546 -file => test_input_file("roa1.dat")); 547$ast->verbose($verbose); 548$as = $ast->next_seq(); 549ok defined $as->seq; 550is($as->display_id, 'HSHNCPA1'); 551is($as->accession_number, 'X79536'); 552is($as->seq_version, 1); 553is($as->version, 1); 554is($as->desc, 'H.sapiens mRNA for hnRNPcore protein A1'); 555is($as->molecule, 'RNA'); 556is($as->alphabet, 'rna'); 557is(scalar $as->all_SeqFeatures(), 4); 558is($as->length, 1198); 559is($as->species->binomial(), 'Homo sapiens'); 560is($as->get_dates, 2); 561 562# EMBL Release 87 changes (8-17-06) 563 564$ast = Bio::SeqIO->new( -format => 'embldriver', 565 -verbose => $verbose, 566 -file => test_input_file("roa1_v2.dat")); 567$ast->verbose($verbose); 568$as = $ast->next_seq(); 569ok defined $as->seq; 570# accession # same as display name now 571is($as->display_id, 'X79536'); 572is($as->get_dates, 2); 573is($as->accession_number, 'X79536'); 574is($as->seq_version, 1); 575is($as->version, 1); 576is($as->desc, 'H.sapiens mRNA for hnRNPcore protein A1'); 577# mRNA instead of RNA 578is($as->molecule, 'mRNA'); 579is($as->alphabet, 'rna'); 580is(scalar $as->all_SeqFeatures(), 4); 581is($as->length, 1198); 582is($as->species->binomial(), 'Homo sapiens'); 583 584my $ent = Bio::SeqIO->new( -file => test_input_file("test.embl"), 585 -format => 'embldriver'); 586$seq = $ent->next_seq(); 587 588is(defined $seq->seq(), 1, 589 'success reading Embl with ^ location and badly split double quotes'); 590is(scalar $seq->annotation->get_Annotations('reference'), 3); 591is($seq->get_dates, 0); 592 593$out = Bio::SeqIO->new(-file=> ">$outfile", 594 -format => 'embl'); 595is($out->write_seq($seq),1, 596 'success writing Embl format with ^ < and > locations'); 597 598# embl with no FT 599$ent = Bio::SeqIO->new( -file => test_input_file("test.embl"), 600 -format => 'embldriver'); 601$seq = $ent->next_seq(); 602 603ok($seq); 604is(lc($seq->subseq(1,10)),'gatcagtaga'); 605is($seq->length, 4870); 606 607# embl with no FH 608my $noFH = Bio::SeqIO->new(-file => test_input_file("no_FH.embl"), 609 -format => 'embldriver'); 610$seq = $noFH->next_seq; 611is(scalar($seq->get_SeqFeatures), 4); 612is($seq->display_id, 'AE000001'); 613is($seq->get_dates, 0); 614 615# bug 1571 616$ent = Bio::SeqIO->new(-format => 'embldriver', 617 -file => test_input_file('test.embl2sq')); 618$seq = $ent->next_seq; 619is($seq->length,4870); 620is($seq->get_dates, 0); 621 622# embl repbase 623$ent = Bio::SeqIO->new(-file => test_input_file("BEL16-LTR_AG.embl"), -format => 'embldriver'); 624$seq = $ent->next_seq; 625is($seq->display_id,'BEL16-LTR_AG'); 626is($seq->get_dates, 2); 627 628# test secondary accessions in EMBL (bug #1332) 629$seqio = Bio::SeqIO->new(-format => 'embldriver', 630 -file => test_input_file('ECAPAH02.embl')); 631$seq = $seqio->next_seq; 632is($seq->accession_number, 'D10483'); 633is($seq->seq_version, 2); 634my @accs = $seq->get_secondary_accessions(); 635is($accs[0], 'J01597'); 636is($accs[-1], 'X56742'); 637is($seq->get_dates, 2); 638 639### TPA TESTS - Thanks to Richard Adams ### 640# test Third Party Annotation entries in EMBL/Gb format 641# to ensure compatability with parsers. 642$str = Bio::SeqIO->new(-verbose => $verbose, 643 -format =>'embldriver', 644 -file => test_input_file('BN000066-tpa.embl')); 645$seq = $str->next_seq; 646ok(defined $seq); 647is($seq->accession_number, 'BN000066'); 648is($seq->alphabet, 'dna'); 649is($seq->display_id, 'AGA000066'); 650is($seq->length, 5195); 651is($seq->division, 'INV'); 652is($seq->keywords, 'acetylcholinesterase; achE1 gene; Third Party Annotation; TPA'); 653is($seq->seq_version, 1); 654is($seq->feature_count, 15); 655is($seq->get_dates, 2); 656 657$spec_obj = $seq->species; 658is ($spec_obj->common_name, 'African malaria mosquito'); 659is ($spec_obj->species, 'gambiae'); 660is ($spec_obj->genus, 'Anopheles'); 661is ($spec_obj->binomial, 'Anopheles gambiae'); 662 663$ac = $seq->annotation; 664$reference = ($ac->get_Annotations('reference') )[1]; 665is ($reference->title,'"A novel acetylcholinesterase gene in mosquitoes codes for the insecticide target and is non-homologous to the ace gene in Drosophila"'); 666is ($reference->authors,'Weill M., Fort P., Berthomi eu A., Dubois M.P., Pasteur N., Raymond M.'); 667my $cmmnt = ($ac->get_Annotations('comment') )[0]; 668is($cmmnt->text, 'see also AJ488492 for achE-1 from Kisumu strain Third Party Annotation Database: This TPA record uses Anopheles gambiae trace archive data (http://trace.ensembl.org)'); 669 670 671$ent = Bio::SeqIO->new( -file => test_input_file("test.embl"), 672 -format => 'embldriver'); 673$ent->verbose($verbose); 674$seq = $ent->next_seq(); 675$species = $seq->species(); 676@cl = $species->classification(); 677is( $cl[3] ne $species->genus(), 1, 'genus duplication test'); 678$ent->close(); 679 680# 681## read-write - test embl writing of a PrimarySeq 682# 683my $primaryseq = Bio::PrimarySeq->new( -seq => 'AGAGAGAGATA', 684 -id => 'myid', 685 -desc => 'mydescr', 686 -alphabet => 'DNA', 687 -accession_number => 'myaccession'); 688 689$verbose = -1 unless $ENV{'BIOPERLDEBUG'}; # silence warnings unless we are debuggin 690 691my $embl = Bio::SeqIO->new(-format => 'embl', 692 -verbose => $verbose, 693 -file => ">$outfile"); 694 695ok($embl->write_seq($primaryseq)); 696 697# this should generate a warning 698my $scalar = "test"; 699eval { 700 $embl->write_seq($scalar); 701}; 702ok ($@); 703 704############################## Swiss/UniProt ############################## 705 706$seqio = Bio::SeqIO->new( -verbose => $verbose, 707 -format => 'swissdriver', 708 -file => test_input_file('test.swiss')); 709 710isa_ok($seqio, 'Bio::SeqIO'); 711$seq = $seqio->next_seq; 712my @gns = $seq->annotation->get_Annotations('gene_name'); 713 714$outfile = test_output_file(); 715$seqio = Bio::SeqIO->new( -verbose => $verbose, 716 -format => 'swiss', 717 -file => ">$outfile"); 718 719$seqio->write_seq($seq); 720 721# reads it in once again 722$seqio = Bio::SeqIO->new( -verbose => $verbose, 723 -format => 'swissdriver', 724 -file => $outfile); 725 726$seq = $seqio->next_seq; 727isa_ok($seq->species, 'Bio::Species'); 728is($seq->species->ncbi_taxid, 6239); 729 730# version, seq_update, dates (5 tests) 731is($seq->version, 40); 732my ($ann) = $seq->annotation->get_Annotations('seq_update'); 733eval {is($ann->display_text, 35,'operator overloading in AnnotationI is deprecated')}; 734ok(!$@); 735 736my @dates = $seq->get_dates; 737my @date_check = qw(01-NOV-1997 01-NOV-1997 16-OCT-2001); 738 739for my $date (@dates) { 740 my $expdate = shift @date_check; 741 is($date, $expdate,'dates'); 742} 743 744my @gns2 = $seq->annotation->get_Annotations('gene_name'); 745# check gene name is preserved (was losing suffix in worm gene names) 746ok($#gns2 == 0 && $gns[0]->value eq $gns2[0]->value); 747 748# test swissprot multiple RP lines 749$str = Bio::SeqIO->new(-file => test_input_file('P33897')); 750$seq = $str->next_seq; 751isa_ok($seq, 'Bio::Seq::RichSeqI'); 752@refs = $seq->annotation->get_Annotations('reference'); 753is( @refs, 23); 754is($refs[20]->rp, 'VARIANTS X-ALD LEU-98; ASP-99; GLU-217; GLN-518; ASP-608; ILE-633 AND PRO-660, AND VARIANT THR-13.'); 755 756# version, seq_update, dates (5 tests) 757is($seq->version, 44); 758($ann) = $seq->annotation->get_Annotations('seq_update'); 759is($ann->display_text, 28,'operator overloading in AnnotationI is deprecated'); 760@dates = $seq->get_dates; 761@date_check = qw(01-FEB-1994 01-FEB-1994 15-JUN-2004); 762for my $date (@dates) { 763 is($date, shift @date_check); 764} 765 766$ast = Bio::SeqIO->new(-verbose => $verbose, 767 -format => 'swissdriver' , 768 -file => test_input_file("roa1.swiss")); 769$as = $ast->next_seq(); 770 771ok defined $as->seq; 772is($as->id, 'ROA1_HUMAN', "id is ".$as->id); 773like($as->primary_id, qr(Bio::PrimarySeq)); 774is($as->length, 371); 775is($as->alphabet, 'protein'); 776is($as->division, 'HUMAN'); 777is(scalar $as->all_SeqFeatures(), 16); 778is(scalar $as->annotation->get_Annotations('reference'), 11); 779 780# version, seq_update, dates (5 tests) 781is($as->version, 35); 782($ann) = $as->annotation->get_Annotations('seq_update'); 783is($ann->display_text, 15,'operator overloading in AnnotationI is deprecated'); 784@dates = $as->get_dates; 785@date_check = qw(01-MAR-1989 01-AUG-1990 01-NOV-1997); 786for my $date (@dates) { 787 is($date, shift @date_check); 788} 789 790($ent,$out) = undef; 791($as,$seq) = undef; 792 793$seqio = Bio::SeqIO->new(-format => 'swissdriver' , 794 -verbose => $verbose, 795 -file => test_input_file("swiss.dat")); 796$seq = $seqio->next_seq; 797isa_ok($seq, 'Bio::Seq::RichSeqI'); 798 799# more tests to verify we are actually parsing correctly 800like($seq->primary_id, qr(Bio::PrimarySeq)); 801is($seq->display_id, 'MA32_HUMAN'); 802is($seq->length, 282); 803is($seq->division, 'HUMAN'); 804is($seq->alphabet, 'protein'); 805my @f = $seq->all_SeqFeatures(); 806is(@f, 2); 807is($f[1]->primary_tag, 'CHAIN'); 808is(($f[1]->get_tag_values('description'))[0], 'COMPLEMENT COMPONENT 1, Q SUBCOMPONENT BINDING PROTEIN'); 809 810# version, seq_update, dates (5 tests) 811is($seq->version, 40); 812($ann) = $seq->annotation->get_Annotations('seq_update'); 813is($ann->display_text, 31,'operator overloading in AnnotationI is deprecated'); 814@dates = $seq->get_dates; 815@date_check = qw(01-FEB-1995 01-FEB-1995 01-OCT-2000); 816for my $date (@dates) { 817 is($date, shift @date_check); 818} 819 820my @genenames = qw(GC1QBP HABP1 SF2P32 C1QBP); 821($ann) = $seq->annotation->get_Annotations('gene_name'); 822# use Data::Stag findval and element name to get values/nodes 823foreach my $gn ( $ann->findval('Name') ) { 824 ok ($gn, shift(@genenames)); 825} 826foreach my $gn ( $ann->findval('Synonyms') ) { 827 ok ($gn, shift(@genenames)); 828} 829like($ann->value, qr/Name: GC1QBP/); 830 831# test for feature locations like ?..N 832$seq = $seqio->next_seq(); 833isa_ok($seq, 'Bio::Seq::RichSeqI'); 834like($seq->primary_id, qr(Bio::PrimarySeq)); 835is($seq->display_id, 'ACON_CAEEL'); 836is($seq->length, 788); 837is($seq->division, 'CAEEL'); 838is($seq->alphabet, 'protein'); 839is(scalar $seq->all_SeqFeatures(), 5); 840 841foreach my $gn ( $seq->annotation->get_Annotations('gene_name') ) { 842 ok ($gn->value, 'F54H12.1'); 843} 844 845# test species in swissprot -- this can be a n:n nightmare 846$seq = $seqio->next_seq(); 847isa_ok($seq, 'Bio::Seq::RichSeqI'); 848like($seq->primary_id, qr(Bio::PrimarySeq)); 849@sec_acc = $seq->get_secondary_accessions(); 850is($sec_acc[0], 'P29360'); 851is($sec_acc[1], 'Q63631'); 852is($seq->accession_number, 'P42655'); 853@kw = $seq->get_keywords; 854is( $kw[0], 'Brain'); 855is( $kw[1], 'Neurone'); 856is($kw[3], 'Multigene family'); 857is($seq->display_id, '143E_HUMAN'); 858 859# hybrid names from old sequences are no longer valid, these are chopped 860# off at the first organism 861is($seq->species->binomial, "Homo sapiens"); 862is($seq->species->common_name, "Human"); 863is($seq->species->ncbi_taxid, 9606); 864 865$seq = $seqio->next_seq(); 866isa_ok($seq, 'Bio::Seq::RichSeqI'); 867like($seq->primary_id, qr(Bio::PrimarySeq)); 868is($seq->species->binomial, "Bos taurus"); 869is($seq->species->common_name, "Bovine"); 870is($seq->species->ncbi_taxid, 9913); 871 872# multiple genes in swissprot 873$seq = $seqio->next_seq(); 874isa_ok($seq, 'Bio::Seq::RichSeqI'); 875like($seq->primary_id, qr(Bio::PrimarySeq)); 876 877($ann) = $seq->annotation->get_Annotations("gene_name"); 878@genenames = qw(CALM1 CAM1 CALM CAM CALM2 CAM2 CAMB CALM3 CAM3 CAMC); 879my $flatnames = "(CALM1 OR CAM1 OR CALM OR CAM) AND (CALM2 OR CAM2 OR CAMB) AND (CALM3 OR CAM3 OR CAMC)"; 880 881my @names = @genenames; # copy array 882 883my @ann_names = $ann->get_all_values(); 884is(scalar(@ann_names), scalar(@names)); 885 886# do this in a layered way (nested tags) 887for my $node ($ann->findnode('gene_name')) { 888 for my $name ($node->findval('Name')) { 889 is($name, shift(@names)); 890 } 891 for my $name ($node->findval('Synonyms')) { 892 is($name, shift(@names)); 893 } 894} 895 896is(scalar(@names),0); 897 898# same entry as before, but with the new gene names format 899$seqio = Bio::SeqIO->new(-format => 'swissdriver', 900 -verbose => $verbose, 901 -file => test_input_file("calm.swiss")); 902$seq = $seqio->next_seq(); 903isa_ok($seq, 'Bio::Seq::RichSeqI'); 904like($seq->primary_id, qr(Bio::PrimarySeq)); 905 906($ann) = $seq->annotation->get_Annotations("gene_name"); 907@names = @genenames; # copy array 908 909my @ann_names2 = $ann->get_all_values(); #emulate StructuredValue's flattened array 910is(scalar(@ann_names2), scalar(@names)); 911 912for my $node ($ann->findnode('gene_name')) { 913 for my $name ($node->findval('Name')) { 914 is($name, shift(@names)); 915 } 916 for my $name ($node->findval('Synonyms')) { 917 is($name, shift(@names)); 918 } 919} 920 921is(scalar(@names),0); 922 923# test proper parsing of references 924my @litrefs = $seq->annotation->get_Annotations('reference'); 925is(scalar(@litrefs), 17); 926 927my @titles = ( 928 '"Complete amino acid sequence of human brain calmodulin."', 929 '"Multiple divergent mRNAs code for a single human calmodulin."', 930 '"Molecular analysis of human and rat calmodulin complementary DNA clones. Evidence for additional active genes in these species."', 931 '"Isolation and nucleotide sequence of a cDNA encoding human calmodulin."', 932 '"Structure of the human CALM1 calmodulin gene and identification of two CALM1-related pseudogenes CALM1P1 and CALM1P2."', 933 undef, 934 '"Characterization of the human CALM2 calmodulin gene and comparison of the transcriptional activity of CALM1, CALM2 and CALM3."', 935 '"Cloning of human full-length CDSs in BD Creator(TM) system donor vector."', 936 '"The DNA sequence and analysis of human chromosome 14."', 937 '"Generation and initial analysis of more than 15,000 full-length human and mouse cDNA sequences."', 938 '"Alpha-helix nucleation by a calcium-binding peptide loop."', 939 '"Solution structure of Ca(2+)-calmodulin reveals flexible hand-like properties of its domains."', 940 '"Calmodulin structure refined at 1.7 A resolution."', 941 '"Drug binding by calmodulin: crystal structure of a calmodulin-trifluoperazine complex."', 942 '"Structural basis for the activation of anthrax adenylyl cyclase exotoxin by calmodulin."', 943 '"Physiological calcium concentrations regulate calmodulin binding and catalysis of adenylyl cyclase exotoxins."', 944 '"Crystal structure of a MARCKS peptide containing the calmodulin-binding domain in complex with Ca2+-calmodulin."', 945); 946 947my @locs = ( 948 "Biochemistry 21:2565-2569(1982).", 949 "J. Biol. Chem. 263:17055-17062(1988).", 950 "J. Biol. Chem. 262:16663-16670(1987).", 951 "Biochem. Int. 9:177-185(1984).", 952 "Eur. J. Biochem. 225:71-82(1994).", 953 "Submitted (FEB-1995) to the EMBL/GenBank/DDBJ databases.", 954 "Cell Calcium 23:323-338(1998).", 955 "Submitted (MAY-2003) to the EMBL/GenBank/DDBJ databases.", 956 "Nature 421:601-607(2003).", 957 "Proc. Natl. Acad. Sci. U.S.A. 99:16899-16903(2002).", 958 "Proc. Natl. Acad. Sci. U.S.A. 96:903-908(1999).", 959 "Nat. Struct. Biol. 8:990-997(2001).", 960 "J. Mol. Biol. 228:1177-1192(1992).", 961 "Biochemistry 33:15259-15265(1994).", 962 "Nature 415:396-402(2002).", 963 "EMBO J. 21:6721-6732(2002).", 964 "Nat. Struct. Biol. 10:226-231(2003).", 965); 966 967my @positions = ( 968 undef, undef, 969 undef, undef, 970 undef, undef, 971 undef, undef, 972 undef, undef, 973 undef, undef, 974 undef, undef, 975 undef, undef, 976 undef, undef, 977 undef, undef, 978 94, 103, 979 1, 76, 980 undef, undef, 981 undef, undef, 982 5, 148, 983 1, 148, 984 undef, undef, 985); 986 987foreach my $litref (@litrefs) { 988 is($litref->title, shift(@titles)); 989 is($litref->location, shift(@locs)); 990 is($litref->start, shift(@positions)); 991 is($litref->end, shift(@positions)); 992} 993 994# format parsing changes (pre-rel 9.0) 995 996$seqio = Bio::SeqIO->new( -verbose => $verbose, 997 -format => 'swissdriver', 998 -file => test_input_file('pre_rel9.swiss')); 999 1000ok($seqio); 1001$seq = $seqio->next_seq; 1002isa_ok($seq->species, 'Bio::Species'); 1003is($seq->species->ncbi_taxid, "6239"); 1004 1005# version, seq_update, dates (5 tests) 1006is($seq->version, 44); 1007($ann) = $seq->annotation->get_Annotations('seq_update'); 1008is($ann->display_text, 1); 1009@dates = $seq->get_dates; 1010@date_check = qw(01-NOV-1997 01-NOV-1996 30-MAY-2006 ); 1011for my $date (@dates) { 1012 is($date, shift @date_check); 1013} 1014 1015my @idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 WBGene00010052 1016 F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090 1017 IPR006092 IPR009075 IPR009100 IPR013764 PF00441 1018 PF02770 PF02771 PS00072 PS00073); 1019 1020for my $dblink ( $seq->annotation->get_Annotations('dblink') ) { 1021 is($dblink->primary_id, shift @idcheck); 1022} 1023 1024$seqio = Bio::SeqIO->new( -verbose => $verbose, 1025 -format => 'swissdriver', 1026 -file => test_input_file('pre_rel9.swiss')); 1027 1028my @namespaces = qw(Swiss-Prot TrEMBL TrEMBL); 1029 1030while (my $seq = $seqio->next_seq) { 1031 is($seq->namespace, shift @namespaces); 1032} 1033 1034# format parsing changes (rel 9.0, Oct 2006) 1035 1036$seqio = Bio::SeqIO->new( -verbose => $verbose, 1037 -format => 'swissdriver', 1038 -file => test_input_file('rel9.swiss')); 1039 1040ok($seqio); 1041$seq = $seqio->next_seq; 1042isa_ok($seq->species, 'Bio::Species'); 1043is($seq->species->ncbi_taxid, 6239); 1044 1045is($seq->version, 47); 1046($ann) = $seq->annotation->get_Annotations('seq_update'); 1047is($ann->display_text, 1,'operator overloading in AnnotationI is deprecated'); 1048@dates = $seq->get_dates; 1049@date_check = qw(01-NOV-1997 01-NOV-1996 31-OCT-2006 ); 1050for my $date (@dates) { 1051 is($date, shift @date_check); 1052} 1053 1054@idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 cel:F54D5.7 1055 WBGene00010052 F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090 1056 IPR006092 IPR009075 IPR013786 IPR009100 IPR013764 PF00441 PF02770 1057 PF02771 PS00072 PS00073 ); 1058 1059for my $dblink ( $seq->annotation->get_Annotations('dblink') ) { 1060 is($dblink->primary_id, shift @idcheck); 1061} 1062 1063$seqio = Bio::SeqIO->new( -verbose => $verbose, 1064 -format => 'swissdriver', 1065 -file => test_input_file('rel9.swiss')); 1066 1067@namespaces = qw(Swiss-Prot TrEMBL TrEMBL); 1068 1069while (my $seq = $seqio->next_seq) { 1070 is($seq->namespace, shift @namespaces); 1071} 1072 1073# bug 2288 1074# Q8GBD3.swiss 1075$seqio = Bio::SeqIO->new( -verbose => $verbose, 1076 -format => 'swiss', 1077 -file => test_input_file('Q8GBD3.swiss')); 1078 1079while (my $seq = $seqio->next_seq) { 1080 my $lineage = join(';', $seq->species->classification); 1081 is ($lineage, 'Acetobacter aceti;Acetobacter subgen. Acetobacter;'. 1082 'Acetobacter;Acetobacteraceae;Rhodospirillales;Alphaproteobacteria;'. 1083 'Proteobacteria;Bacteria'); 1084} 1085 1086# test for GenBank swissprot/UniProt/UniProtKB DBSOURCE line (Bug : RT 44536) 1087$ast = Bio::SeqIO->new(-format => 'genbank', 1088 -verbose => $verbose, 1089 -file => test_input_file('P39765.gb')); 1090$ast->verbose($verbose); 1091$as = $ast->next_seq(); 1092is $as->molecule, 'PRT',$as->accession_number;; 1093is $as->alphabet, 'protein'; 1094# Though older GenBank releases indicate SOURCE contains only the common name, 1095# this is no longer true. In general, this line will contain an abbreviated 1096# form of the full organism name (but may contain the full length name), 1097# as well as the optional common name and organelle. There is no get/set 1098# for the abbreviated name but it is accessible via name() 1099ok defined($as->species->name('abbreviated')->[0]); 1100is $as->species->name('abbreviated')->[0], 'Bacillus subtilis'; 1101is($as->primary_id, 20141743); 1102$ac = $as->annotation; 1103ok defined $ac; 1104@dblinks = $ac->get_Annotations('dblink'); 1105is(scalar @dblinks,31); 1106is($dblinks[0]->database, 'UniProtKB'); 1107is($dblinks[0]->primary_id, 'PYRR_BACSU'); 1108is($dblinks[0]->version, undef); 1109is($dblinks[0]->display_text, 'UniProtKB:PYRR_BACSU','operator overloading in AnnotationI is deprecated'); 1110