1# $Id: XML.pm,v 1.2 1999/11/30 21:06:05 lstein Exp $ 2 3# Boulder::XML 4# 5# XML input/output for Stone objects 6 7package Boulder::XML; 8 9=head1 NAME 10 11Boulder::XML - XML format input/output for Boulder streams 12 13=head1 SYNOPSIS 14 15 use Boulder::XML; 16 17 $stream = Boulder::XML->newFh; 18 19 while ($stone = <$stream>) { 20 print $stream $stone; 21 } 22 23=head1 DESCRIPTION 24 25Boulder::XML generates BoulderIO streams from XML files and/or 26streams. It is also able to output Boulder Stones in XML format. Its 27semantics are similar to those of Boulder::Stream, except that there 28is never any pass-through behavior. 29 30Because XML was not designed for streaming, some care must be taken 31when reading an XML document into a series of Stones. Consider this 32XML document: 33 34 <?xml version="1.0" standalone="yes"?> 35 36 <Paper> 37 <Author>Lincoln Stein</Author> 38 <Author>Jean Siao</Author> 39 <Date>September 29, 1999</Date> 40 <Copyright copyrighted="yes">1999 Lincoln Stein</Copright> 41 <Abstract> 42 This is the abstract. It is not anything very fancy, 43 but it will do. 44 </Abstract> 45 <Citation> 46 <Author>Fitchberg J</Author> 47 <Journal>Journal of Irreproducible Results</Journal> 48 <Volume>23</Volume> 49 <Year>1998</Volume> 50 </Citation> 51 <Citation> 52 <Author>Clemenson V</Author> 53 <Journal>Ecumenica</Journal> 54 <Volume>10</Volume> 55 <Year>1968</Volume> 56 </Citation> 57 <Citation> 58 <Author>Ruggles M</Author> 59 <Journal>Journal of Aesthetic Surgery</Journal> 60 <Volume>10</Volume> 61 <Year>1999</Volume> 62 </Citation> 63 </Paper> 64 65Ordinarily the document will be construed as a single Paper tag 66containing subtags Author, Date, Copyright, Abstract, and so on. 67However it might be desirable to fetch out just the citation tags as a 68series of Stones. In this case, you can declare Citation to be the 69top level tag by passing the B<-tag> argument to new(). Now calling 70get() will return each of the three Citation sections in turn. If no 71tag is explicitly declared to be the top level tag, then Boulder::XML 72will take the first tag it sees in the document. 73 74It is possible to stream XML files. You can either separate them into 75separate documents and use the automatic ARGV processing features of 76the BoulderIO library, or separate the XML documents using a 77B<delimiter> string similar to the delimiters used in MIME multipart 78documents. By default, BoulderIO uses a delimiter of 79E<lt>!--Boulder::XML--E<gt>. 80 81B<This is not a general XML parsing engine!> Instead, it is a way to 82represent BoulderIO tag/value streams in XML format. The module uses 83XML::Parser to parse the XML streams, and therefore any syntactic 84error in the stream can cause the XML parser to quit with an error. 85Another thing to be aware of is that there are certain XML 86constructions that will not translate into BoulderIO format, specifically 87free text that contains embedded tags. This is OK: 88 89 <Author>Jean Siao</Author> 90 91but this is not: 92 93 <Author>The <Emphatic>extremely illustrious</Emphatic> Jean Siao</Author> 94 95In BoulderIO format, tags can contain other tags or text, but cannot 96contain a mixture of tags and text. 97 98=head2 CONSTRUCTORS 99 100=over 4 101 102=item $stream = Boulder::XML->new(*IN,*OUT); 103 104=item $stream = Boulder::XML->new(-in=>*IN,-out=>*OUT,-tag=>$tag,-delim=>$delim,-strip=>$strip) 105 106new() creates a new Boulder::XML stream that can be read from or 107written to. All arguments are optional. 108 109 -in Filehandle to read from. 110 If a file name is provided, will open the file. 111 Defaults to the magic <> filehandle. 112 113 -out Filehandle to write to. 114 If a file name is provided, will open the file for writing. 115 Defaults to STDOUT 116 117 -tag The top-level XML tag to consider as the Stone record. Defaults 118 to the first tag seen when reading from an XML file, or to 119 E<lt>StoneE<gt> when writing to an output stream without 120 previously having read. 121 122 -delim Delimiter to use for delimiting multiple Stone objects in an 123 XML stream. 124 125 -strip If true, automatically strips leading and trailing whitespace 126 from text contained within tags. 127 128=item $fh = Boulder::XML->newFh(*IN,*OUT); 129 130=item $fh = Boulder::XML->newFh(-in=>*IN,-out=>*OUT,-tag=>$tag,-delim=>$delim,-strip=>$strip) 131 132The newFh() constructor creates a tied filehandle that can read and 133write Boulder::XML streams. Invoking <> on the filehandle will 134perform a get(), returning a Stone object. Calling print() on the 135filehandle will perform a put(), writing a Stone object to output in 136XML format. 137 138=back 139 140=head2 METHODS 141 142=over 4 143 144=item $stone = $stream->get() 145 146=item $stream->put($stone) 147 148=item $done = $stream->done 149 150All these methods have the same semantics as the similar methods in 151L<Boulder::Stream>, except that pass-through behavior doesn't apply. 152 153=back 154 155=head1 AUTHOR 156 157Lincoln D. Stein <lstein@cshl.org>, Cold Spring Harbor Laboratory, 158Cold Spring Harbor, NY. This module can be used and distributed on 159the same terms as Perl itself. 160 161=head1 SEE ALSO 162 163L<Boulder>, L<Boulder::Stream>, L<Stone> 164 165=cut 166 167use Boulder::Stream; 168use Stone; 169use XML::Parser; 170 171use strict; 172use vars qw(@ISA); 173 174@ISA = 'Boulder::Stream'; 175*rearrange = \&Boulder::Stream::rearrange; 176*put = \&write_record; 177 178sub new { 179 my $package = shift; 180 my($in,$out,$tag,$delim,$strip) = rearrange(['IN','OUT','TAG','DELIM','STRIP'],@_); 181 my $self = bless { 182 'top_level' => $tag, 183 'delim' => $delim || '<!--Boulder::XML-->', 184 'strip' => $strip, 185 'in' => Boulder::Stream->to_fh($in) || \*ARGV, 186 'out' => Boulder::Stream->to_fh($out,1) || \*STDOUT, 187 },$package; 188 my $parser = XML::Parser->new( 189 ErrorContext => 2, 190 Stream_Delimiter => $self->{delim}, 191 ); 192 @ARGV = ('-') if $self->{in} == \*ARGV and !@ARGV; 193 $parser->setHandlers( 194 Start => sub { $self->_start(@_) }, 195 Default => sub { $self->_default(@_) }, 196 End => sub { $self->_end(@_) } 197 ); 198 $self->{'parser'} = $parser; 199 return $self; 200} 201 202sub read_one_record { 203 my ($self,@tags) = @_; 204 return shift @{$self->{stones}} if $self->{stones} && @{$self->{stones}}; 205 my $fh = $self->magic_file_open || return; 206 $self->{parser}->parse($fh); 207 return shift @{$self->{stones}}; 208} 209 210sub write_record { 211 my $self = shift; 212 my @stone = @_; 213 my $out = $self->{out}; 214 print $out $self->{delim},"\n" if $self->{printed}++; 215 print $out qq(<?xml version="1.0" standalone="yes"?>\n\n); 216 for my $stone (@stone) { 217 next unless ref $stone && $stone->can('asXML'); 218 print $out $stone->asXML($self->{top_level}); 219 } 220} 221 222sub magic_file_open { 223 my $self = shift; 224 my $fh = $self->{in}; 225 return $fh unless $fh == \*main::ARGV; 226 return $fh unless eof $fh; 227 return unless my $a = shift @ARGV; 228 open $fh,$a or die "$a: $!"; 229 return $fh; 230} 231 232sub done { 233 my $self = shift; 234 return if defined $self->{stones} && @{$self->{stones}}; 235 return eof $self->{in} if $self->{in} != \*main::ARGV; 236 return $self->{in} && eof $self->{in} && !@ARGV; 237} 238 239sub _default { 240 my ($self,$p, $string) = @_; 241 return unless $string=~/\S/; 242 if ($self->{'strip'}) { # strip leading whitespace 243 $string =~ s/^\s+//; 244 $string =~ s/\s+$//; 245 } 246 247 return unless $self->{stack} && @{$self->{stack}}; 248 my $stone = $self->{stack}[-1]; 249 my $current = $stone->name(); 250 $current .= $string; 251 $stone->name($current); 252} 253 254sub _start { 255 my ($self,$p, $element, %attributes) = @_; 256 257 $self->{top_level} ||= $element; 258 if ($element eq $self->{top_level}) { 259 $self->{stack} = [$self->{stone} = new Stone]; # empty stone 260 $self->{stone}->attributes(\%attributes) if %attributes; 261 return; 262 } 263 264 return unless $self->{stack}[-1]; 265 my $s = new Stone; 266 $self->{stack}[-1]->insert($element => $s); 267 push(@{$self->{stack}},$s); 268 $s->attributes(\%attributes) if %attributes; 269} 270 271sub _end { 272 my ($self,$p, $element) = @_; 273 274 pop @{$self->{stack}}; 275 276 if ( $element eq $self->{top_level} ) { 277 push @{$self->{stones}},$self->{stone}; 278 delete $self->{stone}; 279 delete $self->{stack}; 280 } 281 282} # End end 283 284 2851; 286