1package Boulder::Unigene; 2# 3use Boulder::Stream; 4require Exporter; 5@ISA = qw(Exporter Boulder::Stream); 6@EXPORT = (); 7@EXPORT_OK = (); 8use Carp; 9$VERSION=1.0; 10use constant DEFAULT_UNIGENE_PATH => '/data/unigene/Hs.dat'; 11 12=head1 NAME 13 14Boulder::Unigene - Fetch Unigene data records as parsed Boulder Stones 15 16=head1 SYNOPSIS 17 18 # parse a file of Unigene records 19 $ug = new Boulder::Unigene(-accessor=>'File', 20 -param => '/data/unigene/Hs.dat'); 21 while (my $s = $ug->get) { 22 print $s->Identifier; 23 print $s->Gene; 24 } 25 26 # parse flatfile records yourself 27 open (UG,"/data/unigene/Hs.dat"); 28 local $/ = "*RECORD*"; 29 while (<UG>) { 30 my $s = Boulder::Unigene->parse($_); 31 # etc. 32 } 33 34=head1 DESCRIPTION 35 36Boulder::Unigene provides retrieval and parsing services for UNIGENE records 37 38Boulder::Unigene provides retrieval and parsing services for NCBI 39Unigene records. It returns Unigene entries in L<Stone> 40format, allowing easy access to the various fields and values. 41Boulder::Unigene is a descendent of Boulder::Stream, and provides a 42stream-like interface to a series of Stone objects. 43 44Access to Unigene is provided by one I<accessors>, which 45give access to local Unigene database. When you 46create a new Boulder::Unigene stream, you provide the 47accessors, along with accessor-specific parameters that control what 48entries to fetch. The accessors is: 49 50=over 2 51 52=item File 53 54This provides access to local Unigene entries by reading from a flat file 55(typically Hs.dat file downloadable from NCBI's Ftp site). 56The stream will return a Stone corresponding to each of the entries in 57the file, starting from the top of the file and working downward. The 58parameter is the path to the local file. 59 60=back 61 62It is also possible to parse a single Unigene entry from a text string 63stored in a scalar variable, returning a Stone object. 64 65=head2 Boulder::Unigene methods 66 67This section lists the public methods that the I<Boulder::Unigene> 68class makes available. 69 70=over 4 71 72=item new() 73 74 # Local fetch via File 75 $ug=new Boulder::Unigene(-accessor => 'File', 76 -param => '/data/unigene/Hs.dat'); 77 78The new() method creates a new I<Boulder::Unigene> stream on the 79accessor provided. The only possible accessors is B<File>. 80If successful, the method returns the stream 81object. Otherwise it returns undef. 82 83new() takes the following arguments: 84 85 -accessor Name of the accessor to use 86 -param Parameters to pass to the accessor 87 88Specify the accessor to use with the B<-accessor> argument. If not 89specified, it defaults to B<File>. 90 91B<-param> is an accessor-specific argument. The possibilities is: 92 93For B<File>, the B<-param> argument must point to a string-valued 94scalar, which will be interpreted as the path to the file to read 95Unigene entries from. 96 97=item get() 98 99The get() method is inherited from I<Boulder::Stream>, and simply 100returns the next parsed Unigene Stone, or undef if there is nothing 101more to fetch. It has the same semantics as the parent class, 102including the ability to restrict access to certain top-level tags. 103 104=item put() 105 106The put() method is inherited from the parent Boulder::Stream class, 107and will write the passed Stone to standard output in Boulder format. 108This means that it is currently not possible to write a 109Boulder::Unigene object back into Unigene flatfile form. 110 111=back 112 113=head1 OUTPUT TAGS 114 115The tags returned by the parsing operation are taken from the names shown in the Flat file 116Hs.dat since no better description of them is provided yet by the database source producer. 117 118=head2 Top-Level Tags 119 120These are tags that appear at the top level of the parsed Unigene 121entry. 122 123=over 4 124 125=item Identifier 126 127The Unigene identifier of this entry. Identifier is a single-value tag. 128 129Example: 130 131 my $identifierNo = $s->Identifier; 132 133=item Title 134 135The Unigene title for this entry. 136 137Example: 138 my $titledef=$s->Title; 139 140=item Gene 141The Gene associated with this Unigene entry 142 143Example: 144 my $thegene=$s->Gene; 145 146=item Cytoband 147The cytological band position of this entry 148 149Example: 150 my $thecytoband=$s->Cytoband; 151 152=item Counts 153The number of EST in this record 154 155Example: 156 my $thecounts=$s->Counts; 157 158=item LocusLink 159The id of the LocusLink entry associated with this record 160 161Example: 162 my $thelocuslink=$s->LocusLink; 163 164=item Chromosome 165This field contains a list, of the chromosomes numbers in which this entry has been linked 166 167Example: 168 my @theChromosome=$s->Chromosome; 169 170=back 171 172=head2 STS 173Multiple records in the form ^STS ACC=XXXXXX NAME=YYYYYY 174 175=over 4 176 177=item ACC 178 179=item NAME 180 181=back 182 183=head2 TXMAP 184Multiple records in the form ^TXMAP XXXXXXX; MARKER=YYYYY; RHPANEL=ZZZZ 185 186The TXMAP tag points to a Stone record that contains multiple 187subtags. Each subtag is the name of a feature which points, in turn, 188to a Stone that describes the feature's location and other attributes. 189 190Each feature will contain one or more of the following subtags: 191 192=over 4 193 194=item MARKER 195 196=item RHPANEL 197 198=back 199 200 201=head2 PROTSIM 202Multiple records in the form ^PROTSIM ORG=XXX; PROTID=DBID:YYY; PCT=ZZZ; ALN=QQQQ 203Where DBID is 204 PID for indicate presence of GenPept identifier, 205 SP to indicate SWISSPROT identifier, 206 PIR to indicate PIR identifier, 207 PRF to indicate ??? 208 209=over 4 210 211=item ORG 212 213=item PROTID 214 215=item PCT 216 217=item ALN 218 219=back 220 221=head2 SEQUENCE 222Multiple records in the form ^SEQUENCE ACC=XXX; NID=YYYY; PID = CLONE= END= LID= 223 224=over 225 226=item ACC 227 228=item NID 229 230=item PID 231 232=item CLONE 233 234=item END 235 236=item LID 237 238=back 239 240=head1 SEE ALSO 241 242L<Boulder>, L<Boulder::Blast>, L<Boulder::Genbank> 243 244=head1 AUTHOR 245 246Lincoln Stein <lstein@cshl.org>. 247Luca I.G. Toldo <luca.toldo@merck.de> 248 249Copyright (c) 1997 Lincoln D. Stein 250Copyright (c) 1999 Luca I.G. Toldo 251 252This library is free software; you can redistribute it and/or modify 253it under the same terms as Perl itself. See DISCLAIMER.txt for 254disclaimers of warranty. 255 256=cut 257 258# 259# Following did not require any changes compared to Genbank.pm 260# 261sub new { 262 my($package,@parameters) = @_; 263 # superclass constructor 264 my($self) = new Boulder::Stream; 265 266 # figure out whether parameters are named. Look for 267 # an initial '-' 268 if ($parameters[0]=~/^-/) { 269 my(%parameters) = @parameters; 270 $self->{'accessor'}=$parameters{'-accessor'} || 'File'; 271 $self->{'param'}=$parameters{'-param'}; 272 $self->{'OUT'}=$parameters{'-out'} || 'main::STDOUT'; 273 } else { 274 $self->{'accessor'}='File'; 275 $self->{'param'}=[@parameters]; 276 } 277 278 croak "Require parameters" unless defined($self->{'param'}); 279 $self->{'accessor'} = new {$self->{'accessor'}}($self->{'param'}); 280 281 return bless $self,$package; 282} 283 284# 285# Following required no changes compared to Genbank.pm 286# 287sub read_record { 288 my($self,@tags) = @_; 289 my($s); 290 291 if (wantarray) { 292 my(@result); 293 while (!$self->{'done'}) { 294 $s = $self->read_one_record(@tags); 295 next unless $s; 296 next if $query && !(&$query); 297 push(@result,$s); 298 } 299 return @result; 300 } 301 302 # we get here if in a scalar context 303 while (!$self->{'done'}) { 304 $s = $self->read_one_record(@tags); 305 next unless $s; 306 return $s unless $query; 307 return $s if &$query; 308 } 309 return undef; 310} 311 312#<LIGT> 313# Here is everything new 314#</LIGT> 315sub parse { 316 my $self = shift; 317 my $record = shift; 318 return unless $record; 319 my $tags = shift; 320 my %ok; 321 %ok = map {$_ => 1} @$tags if ref($tags) eq 'ARRAY'; 322 my($s,@lines,$line,$accumulated,$key,$keyword,$value,$feature,@features, $label); 323 324 $s = new Stone; 325#<LIGT> following this line the parsing of the record must be done 326# each key-value pair is stored by the following command: 327# $self->_addToStone($key,$value,$stone,\%ok); 328# 329# Process new record lines 330# 331# 332 (@recordlines)=split(/\n/,$record); 333 undef $unigeneid, $title, $gene,$cytoband, $locuslink, $chromosome, $scount; 334 undef $sts, $txmap,$protsim,$sequence; 335 undef @sts,@txmaps,@protsims,@sequences; 336 foreach $line (@recordlines) { 337 if ($line=~/^ID/) { 338 ($key,$unigeneid)=split(/\s+/,$line); 339 $self->_addToStone('Identifier',$unigeneid,$s,\%ok); 340 } elsif ($line=~/^TITLE/) { 341 (@titles)=split(/\s+/,$line); 342 shift @titles; 343 $title=join(' ',@titles); 344 $self->_addToStone('Title',$title,$s,\%ok); 345 } elsif ($line=~/^GENE/) { 346 ($key,$gene)=split(/\s+/,$line); 347 $self->_addToStone('Gene',$gene,$s,\%ok); 348 } elsif ($line=~/^CYTOBAND/) { 349 ($key,$cytoband)=split(/\s+/,$line); 350 $self->_addToStone('Cytoband',$cytoband,$s,\%ok); 351 } elsif ($line=~/^LOCUSLINK/) { 352 ($key,$locuslink)=split(/\s+/,$line); 353 $self->_addToStone('Locuslink',$locuslink,$s,\%ok); 354 } elsif ($line=~/^CHROMOSOME/) { 355 ($key,$chromosome)=split(/\s+/,$line); 356 $self->_addToStone('Chromosome',$chromosome,$s,\%ok); 357 } elsif ($line=~/^SCOUNT/) { 358 ($key,$scount)=split(/\s+/,$line); 359 $self->_addToStone('Scount',$scount,$s,\%ok); 360 } elsif ($line=~/^STS/) { 361#STS ACC=XXX; NAME=YYY; 362 (@sts)=split(/\s+/,$line); shift @sts; $sts=join(' ',@sts); 363 ($tmpacc,$tmpname)=split(/\s+/,$sts); 364 ($jnk,$acc)=split(/\=/,$tmpacc); 365 ($jnk,$name)=split(/\=/,$tmpname); 366 367 undef @features; 368 $featurelabel="Accession"; $featurevalue=$name; 369 $feature = {'label'=>$featurelabel,'value'=>$featurevalue}; 370 push(@features,$feature); 371 $featurelabel="Name"; 372 $feature = {'label'=>$featurelabel,'value'=>$featurevalue}; 373 push(@features,$feature); 374 375 $self->_addFeaturesToStone(\@features,_trim($'),$s,\%ok); 376 } elsif ($line=~/^TXMAP/) { 377#TXMAP XXX; MARKER=YYY; RHPANEL=ZZZ; 378 (@txmaps)=split(/\s+/,$line); shift @txmaps; $txmap=join(' ',@txmaps); 379# $self->_addToStone('TXMAP',$txmap,$s,\%ok); 380 undef @features; 381 $self->_addFeaturesToStone(\@features,_trim($'),$s,\%ok); 382 } elsif ($line=~/^PROTSIM/) { 383#PROTSIM ORG=QQQ; PROTID=RRR; PCT=SSSS; ALN=TTTT; 384 (@protsims)=split(/\s+/,$line); shift @protsims; $protsim=join(' ',@protsims); 385# $self->_addToStone('PROTSIM',$protsim,$s,\%ok); 386 undef @features; 387 $self->_addFeaturesToStone(\@features,_trim($'),$s,\%ok); 388 } elsif ($line=~/^SEQUENCE/) { 389#SEQUENCE ACC=XXXX; NID=YYYY; PID=RRRRR; CLONE=QQQ; END=PPPP; LID=ZZZZ; 390 (@sequences)=split(/\s+/,$line); shift @sequences; $sequence=join(' ',@sequences); 391# $self->_addToStone('SEQUENCE',$sequence,$s,\%ok); 392 undef @features; 393 $self->_addFeaturesToStone(\@features,_trim($'),$s,\%ok); 394 } 395 } 396#</LIGT> 397 return $s; 398} 399 400# 401# Following is unchanged from Genbank.pm 402# 403sub read_one_record { 404 my($self,@tags) = @_; 405 my(%ok); 406 my $accessor = $self->{'accessor'}; 407 my $record = $accessor->fetch_next(); 408 unless ($record) { 409 $self->{'done'}++; 410 return undef; 411 } 412 413 return $self->parse($record,\@tags); 414} 415 416# 417# Following is unchanged from Genbank.pm 418# 419sub _trim { 420 my($v) = @_; 421 $v=~s/^\s+//; 422 $v=~s/\s+$//; 423 return $v; 424} 425 426# 427# Following is unchanged from Genbank.pm 428# 429sub _canonicalize { 430 my $h = shift; 431 substr($h,0)=~tr/a-z/A-Z/; 432 substr($h,1,length($h)-1)=~tr/A-Z/a-z/; 433 $h; 434} 435 436# 437# Following is unchanged from Genbank.pm 438# 439sub _addToStone { 440 my($self,$xlabel,$value,$stone,$ok) = @_; 441 return unless !%{$ok} || $ok->{$xlabel}; 442 $stone->insert(_canonicalize($xlabel),$value); 443} 444 445#<LIGT> 446# Following is entirely rewritten 447#</LIGT> 448sub _addFeaturesToStone { 449 my($self,$features,$basecount,$stone,$ok) = @_; 450 my($f) = new Stone; 451 foreach (@$features) { 452 my($q) = $_->{'value'}; 453 my($label) = _canonicalize($_->{'label'}); 454 my($position) = $q=~m!^([^/]+)!; 455 my @qualifiers = $q=~m!/(\w+)=([^/]+)!g; 456 my %qualifiers; 457 while (my($key,$value) = splice(@qualifiers,0,2)) { 458 $value =~ s/^\s*\"//; 459 $value =~s/\"\s*$//; 460 $value=~s/\s+//g if uc($key) eq 'TRANSLATION'; 461 $qualifiers{_canonicalize($key)} = $value; 462 } 463 $f->insert($label=>new Stone('Position'=>$position,%qualifiers)); 464 } 465 $stone->insert('Features',$f); 466} 467 468 469 470# -------------------------- DEFINITION OF ACCESSOR OBJECTS ------------------------------ 471#<LIGT> 472#only name changes for avoid namespace collisions 473#</LIGT> 474package UnigeneAccessor; 475use Carp; 476 477sub new { 478 my($class,@parameters) = @_; 479 croak "UnigeneAccessor::new: Abstract class\n"; 480} 481 482sub fetch_next { 483 my($self) = @_; 484 croak "UnigeneAccessor::fetch_next: Abstract class\n"; 485} 486 487sub DESTROY { 488} 489 490#<LIGT> 491# Following, only the File package since the only one supported. 492# If other access methods must be supported, then here appropriate 493# packages and methods must be implemented 494#</LIGT> 495package File; 496use Carp; 497@ISA=qw(UnigeneAccessor); 498$DEFAULT_PATH = Boulder::Unigene::DEFAULT_UNIGENE_PATH(); 499 500#<LIGT> 501# Following, removed the search for the string locus in the file 502# as validation that the input be compliant with parser 503#</LIGT> 504sub new { 505 my($package,$path) = @_; 506 $path = $DEFAULT_PATH unless $path; 507 open (UG,$path) or croak "File::new(): couldn't open $path: $!"; 508 # read the junk at the beginning 509 my $found; $found++; 510 croak "File::new(): $path doesn't look like a Unigene flat file" 511 unless $found; 512 $_ = <UG>; 513 return bless {'fh'=>UG},$package; 514} 515 516#<LIGT> 517# Following, changed the record separator 518#</LIGT> 519sub fetch_next { 520 my $self = shift; 521 return undef unless $self->{'fh'}; 522 local($/)="//\n"; 523 my($line); 524 my($fh) = $self->{'fh'}; 525 chomp($line = <$fh>); 526 return $line; 527} 528 5291; 530 531__END__ 532 533