1package Chemistry::File::SMILES; 2 3$VERSION = "0.44"; 4# $Id: SMILES.pm,v 1.14 2005/03/29 23:48:34 itubert Exp $ 5 6use 5.006; 7use strict; 8use warnings; 9use base "Chemistry::File"; 10use Chemistry::Mol; 11use Chemistry::Bond::Find 'assign_bond_orders'; 12use List::Util 'first'; 13use Carp; 14 15 16=head1 NAME 17 18Chemistry::File::SMILES - SMILES linear notation parser/writer 19 20=head1 SYNOPSYS 21 22 #!/usr/bin/perl 23 use Chemistry::File::SMILES; 24 25 # parse a SMILES string 26 my $s = 'C1CC1(=O)[O-]'; 27 my $mol = Chemistry::Mol->parse($s, format => 'smiles'); 28 29 # print a SMILES string 30 print $mol->print(format => 'smiles'); 31 32 # print a unique (canonical) SMILES string 33 print $mol->print(format => 'smiles', unique => 1); 34 35 # parse a SMILES file 36 my @mols = Chemistry::Mol->read("file.smi", format => 'smiles'); 37 38 # write a multiline SMILES file 39 Chemistry::Mol->write("file.smi", mols => \@mols); 40 41 42=head1 DESCRIPTION 43 44This module parses a SMILES (Simplified Molecular Input Line Entry 45Specification) string. This is a File I/O driver for the PerlMol project. 46L<http://www.perlmol.org/>. It registers the 'smiles' format with 47Chemistry::Mol. 48 49This parser interprets anything after whitespace as the molecule's name; 50for example, when the following SMILES string is parsed, $mol->name will be 51set to "Methyl chloride": 52 53 CCl Methyl chloride 54 55The name is not included by default on output. However, if the C<name> option 56is defined, the name will be included after the SMILES string, separated by a 57tab. 58 59 print $mol->print(format => 'smiles', name => 1); 60 61=head2 Multiline SMILES and SMILES files 62 63A file or string can contain multiple molecules, one per line. 64 65 CCl Methyl chloride 66 CO Methanol 67 68Files with the extension '.smi' are assumed to have this format. 69 70=head2 Atom Mapping Numbers 71 72As an extension for reaction processing, SMILES strings may have atom mapping 73numbers, which are introduced after a colon in a bracketed atom. For example, 74[C:1]. The mapping number need not be unique. This module reads the mapping 75numbers and stores them as the name of the atom ($atom->name). 76 77On output, atom names are not included by default. See the C<number> and 78C<auto_number> options below for ways of including them. 79 80head1 OPTIONS 81 82The following options are supported in addition to the options mentioned for 83L<Chemistry::File>, such as C<mol_class>, C<format>, and C<fatal>. 84 85=over 86 87=item aromatic 88 89On output, detect aromatic atoms and bonds by means of the Chemistry::Ring 90module, and represent the organic aromatic atoms with lowercase symbols. 91 92=item unique 93 94When used on output, canonicalize the structure if it hasn't been canonicalized 95already and generate a unique SMILES string. This option implies "aromatic". 96 97=item number 98 99For atoms that have a defined name, print the name as the "atom number". For 100example, if an ethanol molecule has the name "42" for the oxygen atom and the 101other atoms have undefined names, the output would be: 102 103 CC[OH:42] 104 105=item auto_number 106 107When used on output, number all the atoms explicitly and sequentially. The 108output for ethanol would look something like this: 109 110 [CH3:1][CH2:2][OH:3] 111 112=item name 113 114Include the molecule name on output, as described in the previous section. 115 116=item kekulize 117 118When used on input, assign single or double bond orders to "aromatic" or 119otherwise unspecified bonds (i.e., generate the Kekule structure). If false, 120the bond orders will remain single. This option is true by default. This uses 121C<assign_bond_orders> from the L<Chemistry::Bond::Find> module. 122 123=back 124 125=cut 126 127# INITIALIZATION 128Chemistry::Mol->register_format('smiles'); 129my $Smiles_parser = __PACKAGE__->new_parser; 130 131#=begin comment 132# 133#=over 134# 135#=cut 136 137sub file_is { 138 my $self = shift; 139 $self->name_is(@_); 140} 141 142sub name_is { 143 my ($self, $name) = @_; 144 $name =~ /\.smi/; 145} 146 147sub slurp_mol { 148 my ($self, $fh) = @_; 149 scalar <$fh>; 150} 151 152sub read_mol { 153 my ($self, $fh, %opts) = @_; 154 %opts = (kekulize => 1, %opts); 155 my $mol_class = $opts{mol_class} || "Chemistry::Mol"; 156 157 my $line = <$fh>; 158 return unless defined $line; 159 $line =~ tr/\r\n//d; 160 my ($smiles, $name) = split " ", $line, 2; 161 162 my $mol = $mol_class->new; 163 unless ($Smiles_parser->parse($smiles, $mol, \%opts)) { 164 warn "error parsing SMILES line '$line'\n"; 165 $mol = $mol_class->new; 166 } 167 $mol->name($name); 168 $self->add_implicit_hydrogens($mol); 169 if ($opts{kekulize}) { 170 assign_bond_orders($mol, method => "itub", use_coords => 0, 171 scratch => 0, charges => 0); 172 } 173 $mol; 174} 175 176 177### The contents of the original Chemistry::Smiles module start below 178 179my $Symbol = qr/ 180 s|p|o|n|c|b|Zr|Zn|Yb|Y|Xe|W|V|U|Tm|Tl|Ti|Th| 181 Te|Tc|Tb|Ta|Sr|Sn|Sm|Si|Sg|Se|Sc|Sb|S|Ru|Rn|Rh|Rf|Re|Rb|Ra| 182 Pu|Pt|Pr|Po|Pm|Pd|Pb|Pa|P|Os|O|Np|No|Ni|Ne|Nd|Nb|Na|N|Mt|Mt| 183 Mo|Mn|Mg|Md|Lu|Lr|Li|La|Kr|K|Ir|In|I|Hs|Hs|Ho|Hg|Hf|He|H|Ge| 184 Gd|Ga|Fr|Fm|Fe|F|Eu|Es|Er|Dy|Ds|Db|Cu|Cs|Cr|Co|Cm|Cl|Cf|Ce| 185 Cd|Ca|C|Br|Bk|Bi|Bh|Be|Ba|B|Au|At|As|Ar|Am|Al|Ag|Ac|\* 186/x; # Order is reverse alphabetical to ensure longest match 187 188my $Simple_symbol = qr/Br|Cl|B|C|N|O|P|S|F|I|H|s|p|o|n|c|b/; 189 190my $Bond = qr/(?:[-=#:.\/\\])?/; 191my $Simple_atom = qr/($Simple_symbol)/; #3 192my $Complex_atom = qr/ 193 (?: 194 \[ #begin atom 195 (\d*) #4 isotope 196 ($Symbol) #5 symbol 197 (\@{0,2}) #6 chirality 198 (?:(H\d*))? #7 H-count 199 (\+{2,}|-{2,}|\+\d*|-\d*)? #8 charge 200 (?::(\d+))? #9 name 201 \] #end atom 202 ) 203/x; 204 205my $Digits = qr/(?:($Bond)(?:\d|%\d\d))*/; 206my $Chain = qr/ 207 \G( #1 208 (?: 209 ($Bond) #2 210 (?:$Simple_atom|$Complex_atom) #3-9 211 ($Digits) #10 212 ) 213 |\( 214 |\) 215 |.+ 216 ) 217/x; 218 219my $digits_re = qr/($Bond)(\%\d\d|\d)/; 220 221my %type_to_order = ( 222 '-' => 1, 223 '=' => 2, 224 '#' => 3, 225 '/' => 1, 226 '\\' => 1, 227 '' => 1, # not strictly true 228 '.' => 0, 229); 230 231my %ORGANIC_ELEMS = ( 232 Br => 1, Cl => 1, B => 3, C => 4, N => 3, O => 2, P => 3, S => 2, 233 F => 1, I => 1, s => 1, p => 1, o => 1, n => 1, c => 1, b => 1, 234); 235 236#=item Chemistry::Smiles->new([add_atom => \&sub1, add_bond => \&sub2]) 237# 238#Create a SMILES parser. If the add_atom and add_bond subroutine references 239#are given, they will be called whenever an atom or a bond needs to be added 240#to the molecule. If they are not specified, default methods, which 241#create a Chemistry::Mol object, will be used. 242# 243#=cut 244 245sub new_parser { 246 my $class = shift; 247 my %opts = @_; 248 my $self = bless { 249 add_atom => $opts{add_atom} || \&add_atom, 250 add_bond => $opts{add_bond} || \&add_bond, 251 }, $class; 252} 253 254#=item $obj->parse($string, $mol) 255# 256#Parse a Smiles $string. $mol is a "molecule state object". It can be anything; 257#the parser doesn't do anything with it except sending it as the first parameter 258#to the callback functions. If callback functions were not provided when 259#constructing the parser object, $mol must be a Chemistry::Mol object, because 260#that's what the default callback functions require. 261# 262#=cut 263 264sub parse { 265 my ($self, $s, $mol, $opts) = @_; 266 $self->{stack} = [ undef ]; 267 $self->{digits} = {}; 268 269 eval { 270 while ($s =~ /$Chain/g) { 271 #my @a = ($1, $2, $3, $4, $5, $6, $7, $8); 272 #print Dumper(\@a); 273 my ($all, $bnd, $sym, $iso, $sym2, $chir, $hcnt, $chg, $name, $dig) 274 = ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10); 275 if ($all eq '(') { 276 $self->start_branch(); 277 } elsif ($all eq ')') { 278 $self->end_branch(); 279 } elsif ($sym) { # Simple atom 280 no warnings; 281 my @digs = parse_digits($dig); 282 $self->atom($mol, $bnd, '', $sym, '', undef, '', \@digs); 283 } elsif ($sym2) { # Complex atom 284 no warnings; 285 my @digs = parse_digits($dig); 286 if ($hcnt eq 'H') { 287 $hcnt = 1; 288 } else { 289 $hcnt =~ s/H//; 290 } 291 unless ($chg =~ /\d/) { 292 $chg = ($chg =~ /-/) ? -length($chg) : length($chg); 293 } 294 $self->atom($mol, $bnd, $iso, $sym2, $chir, $hcnt || 0, 295 $chg, \@digs, $name); 296 } else { 297 die "SMILES ERROR: '$all in $s'\n"; 298 } 299 } 300 }; 301 # clean up to avoid memory leak 302 $self->{stack} = undef; 303 if ($@) { 304 croak $@ if $opts->{fatal}; 305 return; 306 } 307 $mol; 308} 309 310sub parse_digits { 311 my ($dig) = @_; 312 my @digs; 313 while ($dig && $dig =~ /$digits_re/g) { 314 push @digs, {bnd=>$1, dig=>$2}; 315 } 316 @digs; 317} 318 319sub atom { 320 my $self = shift; 321 my ($mol,$bnd,$iso,$sym,$chir,$hcount,$chg,$digs,$name) = @_; 322 #{no warnings; local $" = ','; print "atom(@_)\n"} 323 my $a = $self->{add_atom}($mol,$iso,$sym,$chir,$hcount,$chg,$name); 324 if($self->{stack}[-1]) { 325 $self->{add_bond}($mol, $bnd, $self->{stack}[-1], $a); 326 } 327 for my $dig (@$digs) { 328 if ($self->{digits}{$dig->{dig}}) { 329 if ($dig->{bnd} && $self->{digits}{$dig->{dig}}{bnd} 330 && $dig->{bnd} ne $self->{digits}{$dig->{dig}}{bnd}){ 331 die "SMILES: Inconsistent ring closure\n"; 332 } 333 $self->{add_bond}($mol, 334 $dig->{bnd} || $self->{digits}{$dig->{dig}}{bnd}, 335 $self->{digits}{$dig->{dig}}{atom}, $a); 336 delete $self->{digits}{$dig->{dig}}; 337 } else { 338 $self->{digits}{$dig->{dig}} = {atom=>$a, bnd=>$dig->{bnd}}; 339 } 340 } 341 $self->{stack}[-1] = $a; 342} 343 344#=back 345# 346#=head1 CALLBACK FUNCTIONS 347# 348#=over 349# 350#=item $atom = add_atom($mol, $iso, $sym, $chir, $hcount, $chg) 351# 352#Called by the parser whenever an atom is found. The first parameter is the 353#state object given to $obj->parse(). The other parameters are the isotope, 354#symbol, chirality, hydrogen count, and charge of the atom. Only the symbol is 355#guaranteed to be defined. Mnemonic: the parameters are given in the same order 356#that is used in a SMILES string (such as [18OH-]). This callback is expected to 357#return something that uniquely identifies the atom that was created (it might 358#be a number, a string, or an object). 359# 360#=cut 361 362# Default add_atom callback 363sub add_atom { 364 my ($mol, $iso, $sym, $chir, $hcount, $chg, $name) = @_; 365 my $atom = $mol->new_atom(symbol => ucfirst $sym, name => $name); 366 $iso && $atom->attr('smiles/isotope' => $iso); 367 $iso && $atom->mass($iso); 368 $chir && $atom->attr('smiles/chirality' => $chir); 369 defined $hcount && $atom->hydrogens($hcount); 370 $chg && $atom->formal_charge($chg); 371 if ($sym =~ /^[a-z]/) { 372 $atom->attr("smiles/aromatic", 1); 373 } 374 $atom; 375} 376 377#=item add_bond($mol, $type, $a1, $a2) 378# 379#Called by the parser whenever an bond needs to be created. The first parameter 380#is the state object given to $obj->parse(). The other parameters are the bond 381#type and the two atoms that need to be bonded. The atoms are identified using 382#the return values from the add_atom() callback. 383# 384#=back 385# 386#=end comment 387# 388#=cut 389 390# Default add_bond callback 391sub add_bond { 392 my ($mol, $type, $a1, $a2) = @_; 393 my $order = $type_to_order{$type} or return; # don't add bonds of order 0 394 my $bond = $mol->new_bond(type=>$type, atoms=>[$a1, $a2], order=>$order); 395 $bond->attr("smiles/type" => $type); 396 $bond; 397} 398 399sub start_branch { 400 my $self = shift; 401 #print "start_branch\n"; 402 push @{$self->{stack}}, $self->{stack}[-1]; 403} 404 405sub end_branch { 406 my $self = shift; 407 #print "end_branch\n"; 408 pop @{$self->{stack}}; 409} 410 411sub calc_implicit_hydrogens { 412 my ($self, $atom) = @_; 413 no warnings 'uninitialized'; 414 my $h_count = $ORGANIC_ELEMS{$atom->symbol} - $atom->valence; 415 if ($atom->attr("smiles/aromatic") and $atom->symbol =~ /^[CN]$/) { 416 $h_count--; 417 } 418 $h_count = 0 if $h_count < 0; 419 $h_count; 420} 421 422sub calc_implicit_hydrogens_2 { 423 my ($self, $atom) = @_; 424 my $h_count = $ORGANIC_ELEMS{$atom->symbol} - $atom->valence 425 + $atom->total_hydrogens; 426 $h_count = 0 if $h_count < 0; 427 $h_count; 428} 429 430sub add_implicit_hydrogens { 431 my ($self, $mol) = @_; 432 for my $atom ($mol->atoms) { 433 #print "H=".$atom->hydrogens."\n"; 434 unless (defined $atom->hydrogens) { 435 my $h_count = $self->calc_implicit_hydrogens($atom); 436 $atom->hydrogens($h_count); 437 } 438 } 439} 440 441##### SMILES WRITER ######## 442 443sub write_string { 444 my ($self, $mol_ref, %opts) = @_; 445 446 my $eol; 447 my @mols; 448 if ($opts{mols}) { 449 @mols = @{$opts{mols}}; 450 $eol = "\n"; 451 } else { 452 @mols = $mol_ref; 453 $eol = ""; 454 } 455 456 my $smiles; 457 for my $mol (@mols) { 458 $mol = $mol->clone; 459 $mol->collapse_hydrogens; 460 my @atoms = $mol->atoms; 461 462 if (@atoms) { 463 my $i; 464 if ($opts{auto_number}) { 465 $_->name(++$i) for @atoms; 466 $opts{number} = 1; 467 } 468 if ($opts{unique}) { 469 unless ($atoms[0]->attr("canon/class")) { 470 require Chemistry::Canonicalize; 471 Chemistry::Canonicalize::canonicalize($mol); 472 } 473 $opts{aromatic} = 1; # all unique smiles have to be aromatic 474 @atoms = sort { 475 $a->attr("canon/class") <=> $b->attr("canon/class") 476 } @atoms; 477 } 478 479 if ($opts{aromatic}) { 480 require Chemistry::Ring; 481 Chemistry::Ring::aromatize_mol($mol); 482 } 483 484 my $visited = {}; 485 my @s; 486 for my $atom (@atoms) { 487 next if $visited->{$atom}; 488 my $ring_atoms = {}; 489 490 # first pass to find and number the ring bonds 491 $self->find_ring_bonds($mol, \%opts, $atom, undef, {}, $ring_atoms); 492 493 # second pass to actually generate the SMILES string 494 push @s, $self->branch($mol, \%opts, $atom, undef, $visited, $ring_atoms); 495 } 496 $smiles .= join '.', @s; 497 } 498 499 if ($opts{name}) { 500 $smiles .= "\t" . $mol->name; 501 } 502 $smiles .= $eol; 503 } 504 return $smiles; 505} 506 507sub find_ring_bonds { 508 my ($self, $mol, $opts, $atom, $from_bond, $visited, $ring_atoms) = @_; 509 510 $visited->{$atom} = 1; 511 for my $bn ($self->sorted_bonds_neighbors($atom, $opts)) { 512 my $nei = $bn->{to}; 513 my $bond = $bn->{bond}; 514 next if $visited->{$bond}; 515 $visited->{$bond} = 1; 516 if ($visited->{$nei}) { # closed ring 517 #print "closing ring\n"; 518 $ring_atoms->{$nei}++; 519 } else { 520 $self->find_ring_bonds($mol, $opts, $nei, $bond, $visited, $ring_atoms); 521 } 522 } 523} 524 525sub branch { 526 my ($self, $mol, $opts, $atom, $from_bond, $visited, $digits) = @_; 527 528 my $prev_branch = ""; 529 my $smiles; 530 $smiles .= $self->bond_symbol($from_bond, $opts); 531 #$digits->{count}++; 532 $smiles .= $self->format_atom($atom, $opts); 533 if ($digits->{$atom}) { # opening a ring 534 my @d; 535 for (1 .. $digits->{$atom}) { 536 push @d, $self->next_digit($digits); 537 } 538 $digits->{$atom} = \@d; 539 $smiles .= join "", map { $_ < 10 ? $_ : "%$_"} @d; 540 } 541 542 $visited->{$atom} = 1; 543 my @bns = $self->sorted_bonds_neighbors($atom, $opts); 544 545 for my $bn (@bns) { 546 my $nei = $bn->{to}; 547 my $bond = $bn->{bond}; 548 next if $visited->{$bond}; 549 if ($visited->{$nei}) { # closed a ring 550 my $digit = shift @{$digits->{$nei}}; 551 $smiles .= $self->bond_symbol($bond, $opts); 552 $smiles .= $digit < 10 ? $digit : "%$digit"; 553 $digits->{used_digits}[$digit] = 0; # free for future use 554 $visited->{$bond} = 1; 555 } 556 } 557 558 for my $bn (@bns) { 559 my $nei = $bn->{to}; 560 my $bond = $bn->{bond}; 561 next if $visited->{$bond}; 562 $visited->{$bond} = 1; 563 unless ($visited->{$nei}) { 564 my $branch = $self->branch($mol, $opts, $nei, $bond, $visited, $digits); 565 if ($prev_branch) { 566 $smiles .= "($prev_branch)"; 567 } 568 $prev_branch = $branch; 569 } 570 } 571 $smiles .= "$prev_branch"; 572 $smiles; 573} 574 575sub next_digit { 576 my ($self, $digits) = @_; 577 for (my $i = 1; $i < 100; $i++) { 578 unless ($digits->{used_digits}[$i]) { 579 $digits->{used_digits}[$i] = 1; # mark as used 580 return $i; 581 } 582 } 583 die "no more available smiles digits!"; # shouldn't happen 584} 585 586sub sorted_bonds_neighbors { 587 my ($self, $atom, $opts) = @_; 588 my @bn = $atom->bonds_neighbors; 589 if ($opts->{unique}) { 590 @bn = sort { 591 $a->{to}->attr("canon/class") <=> $b->{to}->attr("canon/class") 592 } @bn; 593 } 594 @bn; 595} 596 597my %ORDER_TO_TYPE = ( 598 2 => '=', 1 => '', 3 => '#', 599); 600 601sub bond_symbol { 602 my ($self, $bond, $opts) = @_; 603 return '' unless $bond; 604 return '' if $opts->{aromatic} && $bond->aromatic; 605 return $ORDER_TO_TYPE{$bond->order}; 606} 607 608sub format_atom { 609 my ($self, $atom, $opts) = @_; 610 611 my $symbol = $atom->symbol; 612 $symbol = lc $symbol if $opts->{aromatic} && $atom->aromatic; 613 my $s = $symbol; 614 615 # unless atom is "simple"... 616 if (!$ORGANIC_ELEMS{$atom->symbol} || $atom->formal_charge 617 || $atom->total_hydrogens != $self->calc_implicit_hydrogens_2($atom) 618 || ($opts->{number} && defined $atom->name) 619 ) { 620 # "complex atom"; bracketed 621 my $h_count = $atom->hydrogens; 622 my $charge = $atom->formal_charge || ''; 623 my $iso = $atom->attr("smiles/isotope") || ''; 624 my $number = ''; 625 626 if ($charge and abs($charge) > 1) { 627 $charge = sprintf("%+d", $charge); 628 } elsif ($charge) { 629 $charge = $charge > 0 ? '+' : '-'; 630 } 631 632 $h_count = $h_count ? ($h_count > 1 ? "H$h_count" : 'H') : ''; 633 634 $number = ':' . $atom->name if $opts->{number} and defined $atom->name; 635 636 $s = "[$iso$symbol$h_count$charge$number]"; 637 } 638 $s; 639} 640 641 6421; 643 644=head1 CAVEATS 645 646Stereochemistry is not supported! Stereochemical descriptors such as @, @@, /, 647and \ will be silently ignored on input, and will certainly not be produced on 648output. 649 650Reading branches that start before an atom, such as (OC)C, which should be 651equivalent to C(OC) and COC, according to some variants of the SMILES 652specification. Many other tools don't implement this rule either. 653 654The kekulize option works by increasing the bond orders of atoms that don't 655have their usual valences satisfied. This may cause problems if you have atoms 656with explicitly low hydrogen counts. 657 658=head1 VERSION 659 6600.44 661 662=head1 SEE ALSO 663 664L<Chemistry::Mol>, L<Chemistry::File> 665 666The SMILES Home Page at http://www.daylight.com/dayhtml/smiles/ 667 668The Daylight Theory Manual at 669http://www.daylight.com/dayhtml/doc/theory/theory.smiles.html 670 671The PerlMol website L<http://www.perlmol.org/> 672 673=head1 AUTHOR 674 675Ivan Tubert-Brohman E<lt>itub@cpan.orgE<gt> 676 677=head1 COPYRIGHT 678 679Copyright (c) 2005 Ivan Tubert-Brohman. All rights reserved. This program is 680free software; you can redistribute it and/or modify it under the same terms as 681Perl itself. 682 683=cut 684 685