1package Chemistry::File::SMILES;
2
3$VERSION = "0.44";
4# $Id: SMILES.pm,v 1.14 2005/03/29 23:48:34 itubert Exp $
5
6use 5.006;
7use strict;
8use warnings;
9use base "Chemistry::File";
10use Chemistry::Mol;
11use Chemistry::Bond::Find 'assign_bond_orders';
12use List::Util 'first';
13use Carp;
14
15
16=head1 NAME
17
18Chemistry::File::SMILES - SMILES linear notation parser/writer
19
20=head1 SYNOPSYS
21
22    #!/usr/bin/perl
23    use Chemistry::File::SMILES;
24
25    # parse a SMILES string
26    my $s = 'C1CC1(=O)[O-]';
27    my $mol = Chemistry::Mol->parse($s, format => 'smiles');
28
29    # print a SMILES string
30    print $mol->print(format => 'smiles');
31
32    # print a unique (canonical) SMILES string
33    print $mol->print(format => 'smiles', unique => 1);
34
35    # parse a SMILES file
36    my @mols = Chemistry::Mol->read("file.smi", format => 'smiles');
37
38    # write a multiline SMILES file
39    Chemistry::Mol->write("file.smi", mols => \@mols);
40
41
42=head1 DESCRIPTION
43
44This module parses a SMILES (Simplified Molecular Input Line Entry
45Specification) string. This is a File I/O driver for the PerlMol project.
46L<http://www.perlmol.org/>. It registers the 'smiles' format with
47Chemistry::Mol.
48
49This parser interprets anything after whitespace as the molecule's name;
50for example, when the following SMILES string is parsed, $mol->name will be
51set to "Methyl chloride":
52
53    CCl	 Methyl chloride
54
55The name is not included by default on output. However, if the C<name> option
56is defined, the name will be included after the SMILES string, separated by a
57tab.
58
59    print $mol->print(format => 'smiles', name => 1);
60
61=head2 Multiline SMILES and SMILES files
62
63A file or string can contain multiple molecules, one per line.
64
65    CCl	 Methyl chloride
66    CO	 Methanol
67
68Files with the extension '.smi' are assumed to have this format.
69
70=head2 Atom Mapping Numbers
71
72As an extension for reaction processing, SMILES strings may have atom mapping
73numbers, which are introduced after a colon in a bracketed atom. For example,
74[C:1]. The mapping number need not be unique. This module reads the mapping
75numbers and stores them as the name of the atom ($atom->name).
76
77On output, atom names are not included by default. See the C<number> and
78C<auto_number> options below for ways of including them.
79
80head1 OPTIONS
81
82The following options are supported in addition to the options mentioned for
83L<Chemistry::File>, such as C<mol_class>, C<format>, and C<fatal>.
84
85=over
86
87=item aromatic
88
89On output, detect aromatic atoms and bonds by means of the Chemistry::Ring
90module, and represent the organic aromatic atoms with lowercase symbols.
91
92=item unique
93
94When used on output, canonicalize the structure if it hasn't been canonicalized
95already and generate a unique SMILES string. This option implies "aromatic".
96
97=item number
98
99For atoms that have a defined name, print the name as the "atom number". For
100example, if an ethanol molecule has the name "42" for the oxygen atom and the
101other atoms have undefined names, the output would be:
102
103    CC[OH:42]
104
105=item auto_number
106
107When used on output, number all the atoms explicitly and sequentially. The
108output for ethanol would look something like this:
109
110    [CH3:1][CH2:2][OH:3]
111
112=item name
113
114Include the molecule name on output, as described in the previous section.
115
116=item kekulize
117
118When used on input, assign single or double bond orders to "aromatic" or
119otherwise unspecified bonds (i.e., generate the Kekule structure). If false,
120the bond orders will remain single. This option is true by default. This uses
121C<assign_bond_orders> from the L<Chemistry::Bond::Find> module.
122
123=back
124
125=cut
126
127# INITIALIZATION
128Chemistry::Mol->register_format('smiles');
129my $Smiles_parser = __PACKAGE__->new_parser;
130
131#=begin comment
132#
133#=over
134#
135#=cut
136
137sub file_is {
138    my $self = shift;
139    $self->name_is(@_);
140}
141
142sub name_is {
143    my ($self, $name) = @_;
144    $name =~ /\.smi/;
145}
146
147sub slurp_mol {
148    my ($self, $fh) = @_;
149    scalar <$fh>;
150}
151
152sub read_mol {
153    my ($self, $fh, %opts) = @_;
154    %opts = (kekulize => 1, %opts);
155    my $mol_class = $opts{mol_class} || "Chemistry::Mol";
156
157    my $line = <$fh>;
158    return unless defined $line;
159    $line =~ tr/\r\n//d;
160    my ($smiles, $name) = split " ", $line, 2;
161
162    my $mol = $mol_class->new;
163    unless ($Smiles_parser->parse($smiles, $mol, \%opts)) {
164        warn "error parsing SMILES line '$line'\n";
165        $mol = $mol_class->new;
166    }
167    $mol->name($name);
168    $self->add_implicit_hydrogens($mol);
169    if ($opts{kekulize}) {
170        assign_bond_orders($mol, method => "itub", use_coords => 0,
171            scratch => 0, charges => 0);
172    }
173    $mol;
174}
175
176
177### The contents of the original Chemistry::Smiles module start below
178
179my $Symbol = qr/
180    s|p|o|n|c|b|Zr|Zn|Yb|Y|Xe|W|V|U|Tm|Tl|Ti|Th|
181    Te|Tc|Tb|Ta|Sr|Sn|Sm|Si|Sg|Se|Sc|Sb|S|Ru|Rn|Rh|Rf|Re|Rb|Ra|
182    Pu|Pt|Pr|Po|Pm|Pd|Pb|Pa|P|Os|O|Np|No|Ni|Ne|Nd|Nb|Na|N|Mt|Mt|
183    Mo|Mn|Mg|Md|Lu|Lr|Li|La|Kr|K|Ir|In|I|Hs|Hs|Ho|Hg|Hf|He|H|Ge|
184    Gd|Ga|Fr|Fm|Fe|F|Eu|Es|Er|Dy|Ds|Db|Cu|Cs|Cr|Co|Cm|Cl|Cf|Ce|
185    Cd|Ca|C|Br|Bk|Bi|Bh|Be|Ba|B|Au|At|As|Ar|Am|Al|Ag|Ac|\*
186/x; # Order is reverse alphabetical to ensure longest match
187
188my $Simple_symbol = qr/Br|Cl|B|C|N|O|P|S|F|I|H|s|p|o|n|c|b/;
189
190my $Bond = qr/(?:[-=#:.\/\\])?/;
191my $Simple_atom = qr/($Simple_symbol)/;   #3
192my $Complex_atom = qr/
193    (?:
194        \[                          #begin atom
195        (\d*)                       #4 isotope
196        ($Symbol)                   #5 symbol
197        (\@{0,2})                   #6 chirality
198        (?:(H\d*))?                 #7 H-count
199        (\+{2,}|-{2,}|\+\d*|-\d*)?  #8 charge
200        (?::(\d+))?                 #9 name
201        \]                          #end atom
202    )
203/x;
204
205my $Digits = qr/(?:($Bond)(?:\d|%\d\d))*/;
206my $Chain = qr/
207    \G(                                     #1
208        (?:
209            ($Bond)                         #2
210            (?:$Simple_atom|$Complex_atom)  #3-9
211            ($Digits)                       #10
212        )
213        |\(
214        |\)
215        |.+
216    )
217/x;
218
219my $digits_re = qr/($Bond)(\%\d\d|\d)/;
220
221my %type_to_order = (
222    '-' => 1,
223    '=' => 2,
224    '#' => 3,
225    '/' => 1,
226    '\\' => 1,
227    '' => 1, # not strictly true
228    '.' => 0,
229);
230
231my %ORGANIC_ELEMS = (
232    Br => 1, Cl => 1, B => 3, C => 4, N => 3, O => 2, P => 3, S => 2,
233    F => 1, I => 1, s => 1, p => 1, o => 1, n => 1, c => 1, b => 1,
234);
235
236#=item Chemistry::Smiles->new([add_atom => \&sub1, add_bond => \&sub2])
237#
238#Create a SMILES parser. If the add_atom and add_bond subroutine references
239#are given, they will be called whenever an atom or a bond needs to be added
240#to the molecule. If they are not specified, default methods, which
241#create a Chemistry::Mol object, will be used.
242#
243#=cut
244
245sub new_parser {
246    my $class = shift;
247    my %opts = @_;
248    my $self = bless {
249        add_atom => $opts{add_atom} || \&add_atom,
250        add_bond => $opts{add_bond} || \&add_bond,
251    }, $class;
252}
253
254#=item $obj->parse($string, $mol)
255#
256#Parse a Smiles $string. $mol is a "molecule state object". It can be anything;
257#the parser doesn't do anything with it except sending it as the first parameter
258#to the callback functions. If callback functions were not provided when
259#constructing the parser object, $mol must be a Chemistry::Mol object, because
260#that's what the default callback functions require.
261#
262#=cut
263
264sub parse {
265    my ($self, $s, $mol, $opts) = @_;
266    $self->{stack} = [ undef ];
267    $self->{digits} = {};
268
269    eval {
270        while ($s =~ /$Chain/g) {
271            #my @a = ($1, $2, $3, $4, $5, $6, $7, $8);
272            #print Dumper(\@a);
273            my ($all, $bnd, $sym, $iso, $sym2, $chir, $hcnt, $chg, $name, $dig)
274                = ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10);
275            if ($all eq '(') {
276                $self->start_branch();
277            } elsif ($all eq ')') {
278                $self->end_branch();
279            } elsif ($sym) { # Simple atom
280                no warnings;
281                my @digs = parse_digits($dig);
282                $self->atom($mol, $bnd, '', $sym, '', undef, '', \@digs);
283            } elsif ($sym2) { # Complex atom
284                no warnings;
285                my @digs = parse_digits($dig);
286                if ($hcnt eq 'H') {
287                    $hcnt = 1;
288                } else {
289                    $hcnt =~ s/H//;
290                }
291                unless ($chg =~ /\d/) {
292                    $chg = ($chg =~ /-/) ? -length($chg) : length($chg);
293                }
294                $self->atom($mol, $bnd, $iso, $sym2, $chir, $hcnt || 0,
295                    $chg, \@digs, $name);
296            } else {
297                die "SMILES ERROR: '$all in $s'\n";
298            }
299        }
300    };
301    # clean up to avoid memory leak
302    $self->{stack} = undef;
303    if ($@) {
304        croak $@ if $opts->{fatal};
305        return;
306    }
307    $mol;
308}
309
310sub parse_digits {
311    my ($dig) = @_;
312    my @digs;
313    while ($dig && $dig =~ /$digits_re/g) {
314        push @digs, {bnd=>$1, dig=>$2};
315    }
316    @digs;
317}
318
319sub atom {
320    my $self = shift;
321    my ($mol,$bnd,$iso,$sym,$chir,$hcount,$chg,$digs,$name) = @_;
322    #{no warnings; local $" = ','; print "atom(@_)\n"}
323    my $a = $self->{add_atom}($mol,$iso,$sym,$chir,$hcount,$chg,$name);
324    if($self->{stack}[-1]) {
325        $self->{add_bond}($mol, $bnd, $self->{stack}[-1], $a);
326    }
327    for my $dig (@$digs) {
328        if ($self->{digits}{$dig->{dig}}) {
329            if ($dig->{bnd} && $self->{digits}{$dig->{dig}}{bnd}
330                &&  $dig->{bnd} ne $self->{digits}{$dig->{dig}}{bnd}){
331                die "SMILES: Inconsistent ring closure\n";
332            }
333            $self->{add_bond}($mol,
334                $dig->{bnd} || $self->{digits}{$dig->{dig}}{bnd},
335                $self->{digits}{$dig->{dig}}{atom}, $a);
336            delete $self->{digits}{$dig->{dig}};
337        } else {
338            $self->{digits}{$dig->{dig}} = {atom=>$a, bnd=>$dig->{bnd}};
339        }
340    }
341    $self->{stack}[-1] = $a;
342}
343
344#=back
345#
346#=head1 CALLBACK FUNCTIONS
347#
348#=over
349#
350#=item $atom = add_atom($mol, $iso, $sym, $chir, $hcount, $chg)
351#
352#Called by the parser whenever an atom is found. The first parameter is the
353#state object given to $obj->parse(). The other parameters are the isotope,
354#symbol, chirality, hydrogen count, and charge of the atom. Only the symbol is
355#guaranteed to be defined. Mnemonic: the parameters are given in the same order
356#that is used in a SMILES string (such as [18OH-]). This callback is expected to
357#return something that uniquely identifies the atom that was created (it might
358#be a number, a string, or an object).
359#
360#=cut
361
362# Default add_atom callback
363sub add_atom {
364    my ($mol, $iso, $sym, $chir, $hcount, $chg, $name) = @_;
365    my $atom = $mol->new_atom(symbol => ucfirst $sym, name => $name);
366    $iso && $atom->attr('smiles/isotope' => $iso);
367    $iso && $atom->mass($iso);
368    $chir && $atom->attr('smiles/chirality' => $chir);
369    defined $hcount && $atom->hydrogens($hcount);
370    $chg && $atom->formal_charge($chg);
371    if ($sym =~ /^[a-z]/) {
372        $atom->attr("smiles/aromatic", 1);
373    }
374    $atom;
375}
376
377#=item add_bond($mol, $type, $a1, $a2)
378#
379#Called by the parser whenever an bond needs to be created. The first parameter
380#is the state object given to $obj->parse(). The other parameters are the bond
381#type and the two atoms that need to be bonded. The atoms are identified using
382#the return values from the add_atom() callback.
383#
384#=back
385#
386#=end comment
387#
388#=cut
389
390# Default add_bond callback
391sub add_bond {
392    my ($mol, $type, $a1, $a2) = @_;
393    my $order = $type_to_order{$type} or return; # don't add bonds of order 0
394    my $bond = $mol->new_bond(type=>$type, atoms=>[$a1, $a2], order=>$order);
395    $bond->attr("smiles/type" => $type);
396    $bond;
397}
398
399sub start_branch {
400    my $self = shift;
401    #print "start_branch\n";
402    push @{$self->{stack}}, $self->{stack}[-1];
403}
404
405sub end_branch {
406    my $self = shift;
407    #print "end_branch\n";
408    pop @{$self->{stack}};
409}
410
411sub calc_implicit_hydrogens {
412    my ($self, $atom) = @_;
413    no warnings 'uninitialized';
414    my $h_count = $ORGANIC_ELEMS{$atom->symbol} - $atom->valence;
415    if ($atom->attr("smiles/aromatic") and $atom->symbol =~ /^[CN]$/) {
416        $h_count--;
417    }
418    $h_count = 0 if $h_count < 0;
419    $h_count;
420}
421
422sub calc_implicit_hydrogens_2 {
423    my ($self, $atom) = @_;
424    my $h_count = $ORGANIC_ELEMS{$atom->symbol} - $atom->valence
425        + $atom->total_hydrogens;
426    $h_count = 0 if $h_count < 0;
427    $h_count;
428}
429
430sub add_implicit_hydrogens {
431    my ($self, $mol) = @_;
432    for my $atom ($mol->atoms) {
433        #print "H=".$atom->hydrogens."\n";
434        unless (defined $atom->hydrogens) {
435            my $h_count = $self->calc_implicit_hydrogens($atom);
436            $atom->hydrogens($h_count);
437        }
438    }
439}
440
441##### SMILES WRITER ########
442
443sub write_string {
444    my ($self, $mol_ref, %opts) = @_;
445
446    my $eol;
447    my @mols;
448    if ($opts{mols}) {
449        @mols = @{$opts{mols}};
450        $eol = "\n";
451    } else {
452        @mols = $mol_ref;
453        $eol = "";
454    }
455
456    my $smiles;
457    for my $mol (@mols) {
458        $mol = $mol->clone;
459        $mol->collapse_hydrogens;
460        my @atoms = $mol->atoms;
461
462        if (@atoms) {
463            my $i;
464            if ($opts{auto_number}) {
465                $_->name(++$i) for @atoms;
466                $opts{number} = 1;
467            }
468            if ($opts{unique}) {
469                unless ($atoms[0]->attr("canon/class")) {
470                    require Chemistry::Canonicalize;
471                    Chemistry::Canonicalize::canonicalize($mol);
472                }
473                $opts{aromatic} = 1; # all unique smiles have to be aromatic
474                @atoms = sort {
475                    $a->attr("canon/class") <=> $b->attr("canon/class")
476                } @atoms;
477            }
478
479            if ($opts{aromatic}) {
480                require Chemistry::Ring;
481                Chemistry::Ring::aromatize_mol($mol);
482            }
483
484            my $visited = {};
485            my @s;
486            for my $atom (@atoms) {
487                next if $visited->{$atom};
488                my $ring_atoms = {};
489
490                # first pass to find and number the ring bonds
491                $self->find_ring_bonds($mol, \%opts, $atom, undef, {}, $ring_atoms);
492
493                # second pass to actually generate the SMILES string
494                push @s, $self->branch($mol, \%opts, $atom, undef, $visited, $ring_atoms);
495            }
496            $smiles .= join '.', @s;
497        }
498
499        if ($opts{name}) {
500            $smiles .= "\t" . $mol->name;
501        }
502        $smiles .= $eol;
503    }
504    return $smiles;
505}
506
507sub find_ring_bonds {
508    my ($self, $mol, $opts, $atom, $from_bond, $visited, $ring_atoms) = @_;
509
510    $visited->{$atom}  = 1;
511    for my $bn ($self->sorted_bonds_neighbors($atom, $opts)) {
512        my $nei  = $bn->{to};
513        my $bond = $bn->{bond};
514        next if $visited->{$bond};
515        $visited->{$bond}  = 1;
516        if ($visited->{$nei}) { # closed ring
517            #print "closing ring\n";
518            $ring_atoms->{$nei}++;
519        } else {
520            $self->find_ring_bonds($mol, $opts, $nei, $bond, $visited, $ring_atoms);
521        }
522    }
523}
524
525sub branch {
526    my ($self, $mol, $opts, $atom, $from_bond, $visited, $digits) = @_;
527
528    my $prev_branch = "";
529    my $smiles;
530    $smiles .= $self->bond_symbol($from_bond, $opts);
531    #$digits->{count}++;
532    $smiles .= $self->format_atom($atom, $opts);
533    if ($digits->{$atom}) {  # opening a ring
534        my @d;
535        for (1 .. $digits->{$atom}) {
536            push @d, $self->next_digit($digits);
537        }
538        $digits->{$atom} = \@d;
539        $smiles .= join "", map { $_ < 10 ? $_ : "%$_"} @d;
540    }
541
542    $visited->{$atom}  = 1;
543    my @bns = $self->sorted_bonds_neighbors($atom, $opts);
544
545    for my $bn (@bns) {
546        my $nei  = $bn->{to};
547        my $bond = $bn->{bond};
548        next if $visited->{$bond};
549        if ($visited->{$nei}) { # closed a ring
550            my $digit = shift @{$digits->{$nei}};
551            $smiles .= $self->bond_symbol($bond, $opts);
552            $smiles .= $digit < 10 ? $digit : "%$digit";
553            $digits->{used_digits}[$digit] = 0; # free for future use
554            $visited->{$bond} = 1;
555        }
556    }
557
558    for my $bn (@bns) {
559        my $nei  = $bn->{to};
560        my $bond = $bn->{bond};
561        next if $visited->{$bond};
562        $visited->{$bond} = 1;
563        unless ($visited->{$nei}) {
564            my $branch = $self->branch($mol, $opts, $nei, $bond, $visited, $digits);
565            if ($prev_branch) {
566                $smiles .= "($prev_branch)";
567            }
568            $prev_branch = $branch;
569        }
570    }
571    $smiles .= "$prev_branch";
572    $smiles;
573}
574
575sub next_digit {
576    my ($self, $digits) = @_;
577    for (my $i = 1; $i < 100; $i++) {
578        unless ($digits->{used_digits}[$i]) {
579            $digits->{used_digits}[$i] = 1;  # mark as used
580            return $i;
581        }
582    }
583    die "no more available smiles digits!";  # shouldn't happen
584}
585
586sub sorted_bonds_neighbors {
587    my ($self, $atom, $opts) = @_;
588    my @bn = $atom->bonds_neighbors;
589    if ($opts->{unique}) {
590        @bn = sort {
591            $a->{to}->attr("canon/class") <=> $b->{to}->attr("canon/class")
592        } @bn;
593    }
594    @bn;
595}
596
597my %ORDER_TO_TYPE = (
598    2 => '=', 1 => '', 3 => '#',
599);
600
601sub bond_symbol {
602    my ($self, $bond, $opts) = @_;
603    return '' unless $bond;
604    return '' if $opts->{aromatic} && $bond->aromatic;
605    return $ORDER_TO_TYPE{$bond->order};
606}
607
608sub format_atom {
609    my ($self, $atom, $opts) = @_;
610
611    my $symbol  = $atom->symbol;
612    $symbol = lc $symbol if $opts->{aromatic} && $atom->aromatic;
613    my $s = $symbol;
614
615    # unless atom is "simple"...
616    if (!$ORGANIC_ELEMS{$atom->symbol} || $atom->formal_charge
617        || $atom->total_hydrogens != $self->calc_implicit_hydrogens_2($atom)
618        || ($opts->{number} && defined $atom->name)
619    ) {
620        # "complex atom"; bracketed
621        my $h_count = $atom->hydrogens;
622        my $charge  = $atom->formal_charge || '';
623        my $iso     = $atom->attr("smiles/isotope") || '';
624        my $number = '';
625
626        if ($charge and abs($charge) > 1) {
627            $charge = sprintf("%+d", $charge);
628        } elsif ($charge) {
629            $charge = $charge > 0 ? '+' : '-';
630        }
631
632        $h_count = $h_count ? ($h_count > 1 ? "H$h_count" : 'H') : '';
633
634        $number = ':' . $atom->name if $opts->{number} and defined $atom->name;
635
636        $s = "[$iso$symbol$h_count$charge$number]";
637    }
638    $s;
639}
640
641
6421;
643
644=head1 CAVEATS
645
646Stereochemistry is not supported! Stereochemical descriptors such as @, @@, /,
647and \ will be silently ignored on input, and will certainly not be produced on
648output.
649
650Reading branches that start before an atom, such as (OC)C, which should be
651equivalent to C(OC) and COC, according to some variants of the SMILES
652specification. Many other tools don't implement this rule either.
653
654The kekulize option works by increasing the bond orders of atoms that don't
655have their usual valences satisfied. This may cause problems if you have atoms
656with explicitly low hydrogen counts.
657
658=head1 VERSION
659
6600.44
661
662=head1 SEE ALSO
663
664L<Chemistry::Mol>, L<Chemistry::File>
665
666The SMILES Home Page at http://www.daylight.com/dayhtml/smiles/
667
668The Daylight Theory Manual at
669http://www.daylight.com/dayhtml/doc/theory/theory.smiles.html
670
671The PerlMol website L<http://www.perlmol.org/>
672
673=head1 AUTHOR
674
675Ivan Tubert-Brohman E<lt>itub@cpan.orgE<gt>
676
677=head1 COPYRIGHT
678
679Copyright (c) 2005 Ivan Tubert-Brohman. All rights reserved. This program is
680free software; you can redistribute it and/or modify it under the same terms as
681Perl itself.
682
683=cut
684
685