1package Chemistry::File::SLN; 2 3$VERSION = "0.11"; 4# $Id: SLN.pm,v 1.4 2005/03/29 16:38:06 itubert Exp $ 5 6use 5.006; 7use strict; 8use warnings; 9use base "Chemistry::File"; 10use Chemistry::Mol; 11use Chemistry::File::SLN::Parser; 12use Chemistry::Bond::Find 'assign_bond_orders'; 13use List::Util qw(sum); 14 15=head1 NAME 16 17Chemistry::File::SLN - SLN linear notation parser/writer 18 19=head1 SYNOPSYS 20 21 #!/usr/bin/perl 22 use Chemistry::File::SLN; 23 24 # parse a SLN string for benzene 25 my $s = 'C[1]H:CH:CH:CH:CH:CH@1'; 26 my $mol = Chemistry::Mol->parse($s, format => 'sln'); 27 28 # print a SLN string 29 print $mol->print(format => 'sln'); 30 31 # print a unique (canonical) SLN string 32 print $mol->print(format => 'sln', unique => 1); 33 34 # parse a multiline SLN file 35 my @mols = Chemistry::Mol->read("file.sln", format => 'sln'); 36 37 # write a multiline SLN file 38 Chemistry::Mol->write("file.sln", mols => [@mols]); 39 40 41=head1 DESCRIPTION 42 43This module parses a SLN (Sybyl Line Notation) string. This is a File I/O 44driver for the PerlMol project. L<http://www.perlmol.org/>. It registers the 45'sln' format with Chemistry::Mol, and recognizes filenames ending in '.sln'. 46 47Optional attributes for atoms, bonds, and molecules are stored as 48$atom->attr("sln/attr"), $bond->attr("sln/attr"), and $mol->attr("sln/attr"), 49respectively. Boolean attributes are stored with a value of 'TRUE'. That's the 50way boolean attributes are recognized when writing, so that they can be written 51in the shortened form. 52 53 $sln_attr->{backbone} = 1; 54 # would be ouput as "C[backbone=1]" 55 56 $sln_attr->{backbone} = 'TRUE'; 57 # would be ouput as "C[backbone]" 58 59Also note that attribute names are normalized to lowercase on reading. 60 61=head1 OPTIONS 62 63The following options are available when reading: 64 65=over 66 67=item kekulize 68 69Assign bond orders for unsatisfied valences or for aromatic bonds. For example, 70benzene read as C[1]H:CH:CH:CH:CH:CH@1 will be converted internally to 71something like C[1]H=CHCH=CHCH=CH@1. This is needed if another format or 72module expects a Kekule representation without an aromatic bond type. 73 74=back 75 76The following options are available when writing: 77 78=over 79 80=item mols 81 82If this option points to an array of molecules, these molecules will be 83written, one per line, as in the example in the SYNOPSYS. 84 85=item aromatic 86 87Detect aromaticity before writing. This will ensure that aromatic bond types 88are used instead of alternate single and double bonds. 89 90=item unique 91 92Canonicalize before writing, and produce a unique strucure. NOTE: this option 93does not guarantee a unique representation for molecules with bracketed 94attributes. 95 96=item name 97 98Include the name of the molecule ($mol->name) in the output string. 99 100=item coord3d, coords 101 102Include the 3D coordinates of every atom in the molecule in the output string. 103C<coord3d> and C<coords> may be used interchangeably. 104 105=item attr 106 107Output the atom, bond, and molecule attributes found in $mol->attr("sln/attr"), 108etc. 109 110=back 111 112=head1 CAVEATS 113 114This version does not implement the full SLN specification. It supports 115simple structures and some attributes, but it does not support any of the 116following: 117 118=over 119 120=item Macro atoms 121 122=item Pattern matching options 123 124=item Markush structures 125 126=item 2D Coordinates 127 128=back 129 130The SLN specification is vague on several points, and I don't have a reference 131implementation available, so I had to make several arbitrary decisions. Also, 132this version of this module has not been tested exhaustively, so please report 133any bugs that you find. 134 135If the parser doesn't understand a string, it only says "syntax error", which 136may not be very helpful. 137 138=cut 139 140# INITIALIZATION 141Chemistry::Mol->register_format('sln'); 142my $Parser = Chemistry::File::SLN::Parser->new; 143 144sub name_is { 145 my ($self, $name) = @_; 146 $name =~ /\.sln$/i; 147} 148 149sub file_is { 150 $_[0]->name_is($_[1]); 151} 152 153sub parse_string { 154 my ($self, $string, %opts) = @_; 155 156 my (@lines) = split /(?:\n|\r\n?)/, $string; 157 my @mols; 158 for my $line (@lines) { 159 my $mol = $self->parse_single_line($line, %opts); 160 return $mol unless wantarray; 161 push @mols, $mol; 162 } 163 @mols; 164} 165 166sub parse_single_line { 167 my ($self, $string, %opts) = @_; 168 169 my $mol_class = $opts{mol_class} || "Chemistry::Mol"; 170 171 172 # call the actual yapp-generated parser 173 my $tree = $Parser->run($string) or return; 174 #use Data::Dumper; print Dumper $tree; 175 176 my $mol = $mol_class->new; 177 my @nodes = @{$tree->{chain}}; 178 my %closures; 179 my $last_atom; 180 my @stack; 181 182 while (my $node = shift @nodes) { 183 if ($node eq '(') { 184 push @stack, $last_atom; 185 } elsif ($node eq ')') { 186 $last_atom = pop @stack; 187 } elsif($last_atom) { # bond 188 my $next = shift @nodes; 189 if ($next->{closure}) { 190 my $atom = $closures{$next->{closure}}; 191 $self->compile_bond($mol, $node, $last_atom, $atom); 192 } else { 193 my $atom = $self->compile_atom($mol, $next, \%closures); 194 $self->compile_bond($mol, $node, $last_atom, $atom); 195 $last_atom = $atom; 196 } 197 } else { # first atom 198 $last_atom = $self->compile_atom($mol, $node, \%closures); 199 } 200 } 201 if ($opts{kekulize}) { 202 assign_bond_orders($mol, method => "itub", use_coords => 0, 203 scratch => 0, charges => 0); 204 } 205 my @sln_attr; 206 while (my ($attr, $value) = each %{$tree->{attr}}) { 207 if ($attr eq 'name') { 208 $mol->name($value); 209 } elsif ($attr eq 'type') { 210 $mol->type($value); 211 } elsif ($attr eq 'coord3d') { 212 $self->read_coords($mol, $value); 213 } else { 214 push @sln_attr, $attr, $value; 215 } 216 } 217 $mol->attr("sln/attr", {@sln_attr}) if @sln_attr; 218 $mol; 219 220} 221 222sub compile_atom { 223 my ($self, $mol, $node, $closures) = @_; 224 my $atom = $mol->new_atom( 225 symbol => $node->{symbol}, 226 hydrogens => $node->{hcount}, 227 formal_charge => $node->{attr}{charge}, 228 ); 229 $atom->attr("sln/attr", $node->{attr}); 230 delete $node->{attr}{charge}; 231 $closures->{$node->{id}} = $atom if $node->{id}; 232 $atom; 233} 234 235my %TYPE_TO_ORDER = ( 236 '-' => 1, 237 '=' => 2, 238 '#' => 3, 239 ':' => 1, 240 '.' => 0, 241); 242 243sub compile_bond { 244 my ($self, $mol, $node, $atom1, $atom2) = @_; 245 my $order = $TYPE_TO_ORDER{$node->{type}}; 246 if ($order) { 247 my $bond = $mol->new_bond( 248 type => $node->{type}, 249 atoms=>[$atom1, $atom2], 250 order => $order, 251 ); 252 $bond->attr("sln/attr", $node->{attr}); 253 if ($node->{type} eq ':') { 254 $_->aromatic(1) for ($atom1, $atom2, $bond); 255 } 256 } 257} 258 259sub read_coords { 260 my ($self, $mol, $coords_str) = @_; 261 $coords_str =~ s/[()]//g; 262 my (@coords) = split /,/, $coords_str; 263 my $fh = $mol->formula_hash; 264 my $n = sum(values %$fh); 265 my $sprout = (@coords == 3*$n); 266 for my $atom ($mol->atoms) { 267 $atom->coords(splice @coords, 0, 3); 268 if ($sprout) { 269 for (1 .. $atom->implicit_hydrogens) { 270 my $H = $mol->new_atom(symbol => 'H', 271 coords => [splice @coords, 0, 3]); 272 $mol->new_bond(atoms => [$atom, $H]); 273 } 274 $atom->implicit_hydrogens(0); 275 } 276 } 277} 278 279 280########### WRITER ################# 281 282 283sub write_string { 284 my ($self, $mol_ref, %opts) = @_; 285 286 my $eol; 287 my @mols; 288 if ($opts{mols}) { 289 @mols = @{$opts{mols}}; 290 $eol = "\n"; 291 } else { 292 @mols = $mol_ref; 293 $eol = ""; 294 } 295 296 my $sln; 297 for my $mol (@mols) { 298 $sln .= $self->write_mol($mol, %opts) . $eol; 299 } 300 $sln; 301} 302 303sub write_mol { 304 my ($self, $mol, %opts) = @_; 305 306 my $oldmol = $mol; 307 $mol = $mol->clone; 308 309 my $sln = ''; 310 my @id_log; 311 if ($mol->atoms) { 312 my @atoms = $self->clean_mol($mol, %opts); 313 314 my $visited = {}; 315 my @s; 316 for my $atom (@atoms) { 317 next if $visited->{$atom}; 318 my $ring_atoms = {}; 319 320 # first pass to find and number the ring bonds 321 $self->find_ring_bonds($mol, \%opts, $atom, undef, {}, $ring_atoms); 322 323 # second pass to actually generate the sln string 324 push @s, $self->branch($mol, \%opts, $atom, undef, $visited, 325 $ring_atoms, \@id_log); 326 } 327 $sln .= join '.', @s; 328 } 329 330 $sln .= $self->format_ctab_attr($mol, \%opts, $oldmol, \@id_log); 331} 332 333sub clean_mol { 334 my ($self, $mol, %opts) = @_; 335 336 $self->collapse_hydrogens($mol); 337 my @atoms = $mol->atoms; 338 if ($opts{unique}) { 339 unless ($atoms[0]->attr("canon/class")) { 340 require Chemistry::Canonicalize; 341 Chemistry::Canonicalize::canonicalize($mol); 342 } 343 #$opts{aromatic} = 1; # all unique sln have to be aromatic 344 @atoms = sort { 345 $a->attr("canon/class") <=> $b->attr("canon/class") 346 } @atoms; 347 } 348 349 if ($opts{aromatic}) { 350 require Chemistry::Ring; 351 Chemistry::Ring::aromatize_mol($mol); 352 } 353 @atoms; 354} 355 356sub format_ctab_attr { 357 my ($self, $mol, $opts, $oldmol, $id_log) = @_; 358 359 my $sln = ''; 360 if ($opts->{name} or $opts->{attr} or $opts->{coords} or $opts->{coord3d}) { 361 no warnings 'uninitialized'; 362 my @attr; 363 my $name = $mol->name; 364 $name =~ s/[\r\n]//g; 365 push @attr, 'name="' . $mol->name . '"' 366 if $opts->{name} and length $mol->name; 367 my @coords; 368 if ($opts->{coord3d} or $opts->{coords}) { 369 my @all_atoms = map { 370 ( 371 $oldmol->by_id($_), 372 grep {$_->symbol eq 'H'} 373 $oldmol->by_id($_)->neighbors 374 ) 375 } @$id_log; 376 push @coords, sprintf("(%.3f,%.3f,%.3f)",$_->coords->array) 377 for @all_atoms; 378 push @attr, 'coord3d=' . join(',',@coords); 379 } 380 if ($opts->{attr}) { 381 push @attr, $self->format_sln_attr($mol); 382 } 383 $sln .= '<' . join(';', @attr) . '>' if @attr; 384 } 385 $sln; 386} 387 388 389sub find_ring_bonds { 390 my ($self, $mol, $opts, $atom, $from_bond, $visited, $ring_atoms) = @_; 391 392 $visited->{$atom} = 1; 393 for my $bn (sorted_bonds_neighbors($atom, $opts)) { 394 my $nei = $bn->{to}; 395 my $bond = $bn->{bond}; 396 next if $visited->{$bond}; 397 $visited->{$bond} = 1; 398 if ($visited->{$nei}) { # closed ring 399 #print "closing ring\n"; 400 $ring_atoms->{$nei}++; 401 } else { 402 $self->find_ring_bonds($mol, $opts, $nei, 403 $bond, $visited, $ring_atoms); 404 } 405 } 406} 407 408sub branch { 409 my ($self, $mol, $opts, $atom, $from_bond, $visited, $digits, $id_log) = @_; 410 411 my $prev_branch = ""; 412 my $sln; 413 $sln .= $self->format_bond($from_bond, $opts); 414 my $digit; 415 if ($digits->{$atom}) { # opening a ring 416 $digit = $self->next_digit($digits); 417 $digits->{$atom} = $digit; 418 } 419 $sln .= $self->format_atom($atom, $opts, $digit); 420 push @$id_log, $atom->id; 421 422 $visited->{$atom} = 1; 423 my @bns = sorted_bonds_neighbors($atom, $opts); 424 425 for my $bn (@bns) { 426 my $nei = $bn->{to}; 427 my $bond = $bn->{bond}; 428 next if $visited->{$bond}; 429 $visited->{$bond} = 1; 430 if ($visited->{$nei}) { # closed a ring 431 if ($prev_branch) { 432 $sln .= "($prev_branch)"; 433 } 434 $prev_branch = $self->format_bond($bond, $opts) 435 . '@' . $digits->{$nei}; 436 $visited->{$bond} = 1; 437 } else { 438 my $branch = $self->branch($mol, $opts, $nei, $bond, $visited, 439 $digits, $id_log); 440 if ($prev_branch) { 441 $sln .= "($prev_branch)"; 442 } 443 $prev_branch = $branch; 444 } 445 } 446 $sln .= "$prev_branch"; 447 $sln; 448} 449 450sub next_digit { 451 my ($self, $digits) = @_; 452 ++$digits->{used_digits}; 453} 454 455sub collapse_hydrogens { 456 my ($self, $mol) = @_; 457 458 for my $atom (grep {$_->symbol eq 'H'} $mol->atoms) { 459 my ($neighbor) = $atom->neighbors or next; 460 $atom->delete; 461 my $h_count = $neighbor->hydrogens; 462 $h_count++; 463 $neighbor->hydrogens($h_count); 464 } 465} 466 467sub sorted_bonds_neighbors { 468 my ($atom, $opts) = @_; 469 my @bn = $atom->bonds_neighbors; 470 if ($opts->{unique}) { 471 @bn = sort { 472 $a->{to}->attr("canon/class") <=> $b->{to}->attr("canon/class") 473 } @bn; 474 } 475 @bn; 476} 477 478my %ORDER_TO_TYPE = ( 479 1 => '', 2 => '=', 3 => '#', 4 => '', 0 => '.', 480); 481 482sub format_bond { 483 my ($self, $bond, $opts) = @_; 484 return '' unless $bond; 485 my $s = $bond->aromatic ? ':' : $ORDER_TO_TYPE{$bond->order}; 486 my @attr; 487 @attr = $self->format_sln_attr($bond) if $opts->{attr}; 488 if (@attr) { 489 $s .= '[' . join(";", @attr) . ']'; 490 } 491 $s; 492} 493 494sub format_atom { 495 my ($self, $atom, $opts, $digit) = @_; 496 my $s; 497 no warnings 'uninitialized'; 498 my $h_count = $atom->hydrogens; 499 my $charge = $atom->formal_charge; 500 my $symbol = $atom->symbol; 501 502 $charge = $charge ? sprintf("%+d", $charge): ''; 503 $h_count = $h_count ? ($h_count > 1 ? "H$h_count" : 'H') : ''; 504 505 $s = $symbol; 506 my @attr; 507 @attr = $self->format_sln_attr($atom) if $opts->{attr}; 508 if ($charge or $digit or @attr) { 509 $s .= '['; 510 $s .= $digit; 511 unshift @attr, $charge if $charge; 512 if (@attr) { 513 $s .= ':' if $digit; 514 $s .= join ';', @attr; 515 } 516 $s .= ']'; 517 } 518 $s .= $h_count; 519 $s; 520} 521 522sub format_sln_attr { 523 my ($self, $obj) = @_; 524 my $sln_attr = $obj->attr("sln/attr") || {}; 525 my @attr; 526 for my $key (sort keys %$sln_attr) { 527 my $val = $sln_attr->{$key}; 528 push @attr, "$key" . ($val eq 'TRUE' ? "" : "=$val"); 529 } 530 @attr; 531} 532 5331; 534 535=head1 VERSION 536 5370.11 538 539=head1 SEE ALSO 540 541L<Chemistry::Mol>, L<Chemistry::File>, L<Chemistry::File::SMILES> 542 543The PerlMol website L<http://www.perlmol.org/> 544 545Ash, S.; Cline, M. A.; Homer, R. W.; Hurst, T.; Smith, G. B., SYBYL Line 546Notation (SLN): A Versatile Language for Chemical Structure Representation. J. 547Chem. Inf. Comput. Sci; 1997; 37(1); 71-79. DOI: 10.1021/ci960109j 548(L<http://dx.doi.org/10.1021/ci960109j>) 549 550=head1 AUTHOR 551 552Ivan Tubert-Brohman E<lt>itub@cpan.orgE<gt> 553 554=head1 COPYRIGHT 555 556Copyright (c) 2004 Ivan Tubert-Brohman. All rights reserved. This program is 557free software; you can redistribute it and/or modify it under the same terms as 558Perl itself. 559 560=cut 561 562