1package Chemistry::File::SLN;
2
3$VERSION = "0.11";
4# $Id: SLN.pm,v 1.4 2005/03/29 16:38:06 itubert Exp $
5
6use 5.006;
7use strict;
8use warnings;
9use base "Chemistry::File";
10use Chemistry::Mol;
11use Chemistry::File::SLN::Parser;
12use Chemistry::Bond::Find 'assign_bond_orders';
13use List::Util qw(sum);
14
15=head1 NAME
16
17Chemistry::File::SLN - SLN linear notation parser/writer
18
19=head1 SYNOPSYS
20
21    #!/usr/bin/perl
22    use Chemistry::File::SLN;
23
24    # parse a SLN string for benzene
25    my $s = 'C[1]H:CH:CH:CH:CH:CH@1';
26    my $mol = Chemistry::Mol->parse($s, format => 'sln');
27
28    # print a SLN string
29    print $mol->print(format => 'sln');
30
31    # print a unique (canonical) SLN string
32    print $mol->print(format => 'sln', unique => 1);
33
34    # parse a multiline SLN file
35    my @mols = Chemistry::Mol->read("file.sln", format => 'sln');
36
37    # write a multiline SLN file
38    Chemistry::Mol->write("file.sln", mols => [@mols]);
39
40
41=head1 DESCRIPTION
42
43This module parses a SLN (Sybyl Line Notation) string. This is a File I/O
44driver for the PerlMol project.  L<http://www.perlmol.org/>. It registers the
45'sln' format with Chemistry::Mol, and recognizes filenames ending in '.sln'.
46
47Optional attributes for atoms, bonds, and molecules are stored as
48$atom->attr("sln/attr"), $bond->attr("sln/attr"), and $mol->attr("sln/attr"),
49respectively. Boolean attributes are stored with a value of 'TRUE'. That's the
50way boolean attributes are recognized when writing, so that they can be written
51in the shortened form.
52
53    $sln_attr->{backbone} = 1;
54    # would be ouput as "C[backbone=1]"
55
56    $sln_attr->{backbone} = 'TRUE';
57    # would be ouput as "C[backbone]"
58
59Also note that attribute names are normalized to lowercase on reading.
60
61=head1 OPTIONS
62
63The following options are available when reading:
64
65=over
66
67=item kekulize
68
69Assign bond orders for unsatisfied valences or for aromatic bonds. For example,
70benzene read as C[1]H:CH:CH:CH:CH:CH@1 will be converted internally to
71something like C[1]H=CHCH=CHCH=CH@1. This is needed if another format or
72module expects a Kekule representation without an aromatic bond type.
73
74=back
75
76The following options are available when writing:
77
78=over
79
80=item mols
81
82If this option points to an array of molecules, these molecules will be
83written, one per line, as in the example in the SYNOPSYS.
84
85=item aromatic
86
87Detect aromaticity before writing. This will ensure that aromatic bond types
88are used instead of alternate single and double bonds.
89
90=item unique
91
92Canonicalize before writing, and produce a unique strucure. NOTE: this option
93does not guarantee a unique representation for molecules with bracketed
94attributes.
95
96=item name
97
98Include the name of the molecule ($mol->name) in the output string.
99
100=item coord3d, coords
101
102Include the 3D coordinates of every atom in the molecule in the output string.
103C<coord3d> and C<coords> may be used interchangeably.
104
105=item attr
106
107Output the atom, bond, and molecule attributes found in $mol->attr("sln/attr"),
108etc.
109
110=back
111
112=head1 CAVEATS
113
114This version does not implement the full SLN specification. It supports
115simple structures and some attributes, but it does not support any of the
116following:
117
118=over
119
120=item Macro atoms
121
122=item Pattern matching options
123
124=item Markush structures
125
126=item 2D Coordinates
127
128=back
129
130The SLN specification is vague on several points, and I don't have a reference
131implementation available, so I had to make several arbitrary decisions. Also,
132this version of this module has not been tested exhaustively, so please report
133any bugs that you find.
134
135If the parser doesn't understand a string, it only says "syntax error", which
136may not be very helpful.
137
138=cut
139
140# INITIALIZATION
141Chemistry::Mol->register_format('sln');
142my $Parser = Chemistry::File::SLN::Parser->new;
143
144sub name_is {
145    my ($self, $name) = @_;
146    $name =~ /\.sln$/i;
147}
148
149sub file_is {
150    $_[0]->name_is($_[1]);
151}
152
153sub parse_string {
154    my ($self, $string, %opts) = @_;
155
156    my (@lines) = split /(?:\n|\r\n?)/, $string;
157    my @mols;
158    for my $line (@lines) {
159        my $mol = $self->parse_single_line($line, %opts);
160        return $mol unless wantarray;
161        push @mols, $mol;
162    }
163    @mols;
164}
165
166sub parse_single_line {
167    my ($self, $string, %opts) = @_;
168
169    my $mol_class = $opts{mol_class} || "Chemistry::Mol";
170
171
172    # call the actual yapp-generated parser
173    my $tree = $Parser->run($string) or return;
174    #use Data::Dumper; print Dumper $tree;
175
176    my $mol = $mol_class->new;
177    my @nodes = @{$tree->{chain}};
178    my %closures;
179    my $last_atom;
180    my @stack;
181
182    while (my $node = shift @nodes) {
183        if ($node eq '(') {
184            push @stack, $last_atom;
185        } elsif ($node eq ')') {
186            $last_atom = pop @stack;
187        } elsif($last_atom) { # bond
188            my $next = shift @nodes;
189            if ($next->{closure}) {
190                my $atom = $closures{$next->{closure}};
191                $self->compile_bond($mol, $node, $last_atom, $atom);
192            } else {
193                my $atom = $self->compile_atom($mol, $next, \%closures);
194                $self->compile_bond($mol, $node, $last_atom, $atom);
195                $last_atom = $atom;
196            }
197        } else {  # first atom
198            $last_atom = $self->compile_atom($mol, $node, \%closures);
199        }
200    }
201    if ($opts{kekulize}) {
202        assign_bond_orders($mol, method => "itub", use_coords => 0,
203            scratch => 0, charges => 0);
204    }
205    my @sln_attr;
206    while (my ($attr, $value) = each %{$tree->{attr}}) {
207        if ($attr eq 'name') {
208            $mol->name($value);
209        } elsif ($attr eq 'type') {
210            $mol->type($value);
211        } elsif ($attr eq 'coord3d') {
212            $self->read_coords($mol, $value);
213        } else {
214            push @sln_attr, $attr, $value;
215        }
216    }
217    $mol->attr("sln/attr", {@sln_attr}) if @sln_attr;
218    $mol;
219
220}
221
222sub compile_atom {
223    my ($self, $mol, $node, $closures) = @_;
224    my $atom = $mol->new_atom(
225        symbol          => $node->{symbol},
226        hydrogens       => $node->{hcount},
227        formal_charge   => $node->{attr}{charge},
228    );
229    $atom->attr("sln/attr", $node->{attr});
230    delete $node->{attr}{charge};
231    $closures->{$node->{id}} = $atom if $node->{id};
232    $atom;
233}
234
235my %TYPE_TO_ORDER = (
236    '-' => 1,
237    '=' => 2,
238    '#' => 3,
239    ':' => 1,
240    '.' => 0,
241);
242
243sub compile_bond {
244    my ($self, $mol, $node, $atom1, $atom2) = @_;
245    my $order = $TYPE_TO_ORDER{$node->{type}};
246    if ($order) {
247        my $bond = $mol->new_bond(
248            type => $node->{type},
249            atoms=>[$atom1, $atom2],
250            order => $order,
251        );
252        $bond->attr("sln/attr", $node->{attr});
253        if ($node->{type} eq ':') {
254            $_->aromatic(1) for ($atom1, $atom2, $bond);
255        }
256    }
257}
258
259sub read_coords {
260    my ($self, $mol, $coords_str) = @_;
261    $coords_str =~ s/[()]//g;
262    my (@coords) = split /,/, $coords_str;
263    my $fh = $mol->formula_hash;
264    my $n = sum(values %$fh);
265    my $sprout = (@coords == 3*$n);
266    for my $atom ($mol->atoms) {
267        $atom->coords(splice @coords, 0, 3);
268        if ($sprout) {
269            for (1 .. $atom->implicit_hydrogens) {
270                my $H = $mol->new_atom(symbol => 'H',
271                    coords => [splice @coords, 0, 3]);
272                $mol->new_bond(atoms => [$atom, $H]);
273            }
274            $atom->implicit_hydrogens(0);
275        }
276    }
277}
278
279
280########### WRITER #################
281
282
283sub write_string {
284    my ($self, $mol_ref, %opts) = @_;
285
286    my $eol;
287    my @mols;
288    if ($opts{mols}) {
289        @mols = @{$opts{mols}};
290        $eol = "\n";
291    } else {
292        @mols = $mol_ref;
293        $eol = "";
294    }
295
296    my $sln;
297    for my $mol (@mols) {
298        $sln .= $self->write_mol($mol, %opts) . $eol;
299    }
300    $sln;
301}
302
303sub write_mol {
304    my ($self, $mol, %opts) = @_;
305
306    my $oldmol = $mol;
307    $mol = $mol->clone;
308
309    my $sln = '';
310    my @id_log;
311    if ($mol->atoms) {
312        my @atoms = $self->clean_mol($mol, %opts);
313
314        my $visited = {};
315        my @s;
316        for my $atom (@atoms) {
317            next if $visited->{$atom};
318            my $ring_atoms = {};
319
320            # first pass to find and number the ring bonds
321            $self->find_ring_bonds($mol, \%opts, $atom, undef, {}, $ring_atoms);
322
323            # second pass to actually generate the sln string
324            push @s, $self->branch($mol, \%opts, $atom, undef, $visited,
325                $ring_atoms, \@id_log);
326        }
327        $sln .= join '.', @s;
328    }
329
330    $sln .= $self->format_ctab_attr($mol, \%opts, $oldmol, \@id_log);
331}
332
333sub clean_mol {
334    my ($self, $mol, %opts) = @_;
335
336    $self->collapse_hydrogens($mol);
337    my @atoms = $mol->atoms;
338    if ($opts{unique}) {
339        unless ($atoms[0]->attr("canon/class")) {
340            require Chemistry::Canonicalize;
341            Chemistry::Canonicalize::canonicalize($mol);
342        }
343        #$opts{aromatic} = 1; # all unique sln have to be aromatic
344        @atoms = sort {
345            $a->attr("canon/class") <=> $b->attr("canon/class")
346        } @atoms;
347    }
348
349    if ($opts{aromatic}) {
350        require Chemistry::Ring;
351        Chemistry::Ring::aromatize_mol($mol);
352    }
353    @atoms;
354}
355
356sub format_ctab_attr {
357    my ($self, $mol, $opts, $oldmol, $id_log) = @_;
358
359    my $sln = '';
360    if ($opts->{name} or $opts->{attr} or $opts->{coords} or $opts->{coord3d}) {
361        no warnings 'uninitialized';
362        my @attr;
363        my $name = $mol->name;
364        $name =~ s/[\r\n]//g;
365        push @attr, 'name="' . $mol->name . '"'
366            if $opts->{name} and length $mol->name;
367        my @coords;
368        if ($opts->{coord3d} or $opts->{coords}) {
369            my @all_atoms = map {
370                (
371                    $oldmol->by_id($_),
372                    grep {$_->symbol eq 'H'}
373                        $oldmol->by_id($_)->neighbors
374                )
375            } @$id_log;
376            push @coords, sprintf("(%.3f,%.3f,%.3f)",$_->coords->array)
377                for @all_atoms;
378            push @attr, 'coord3d=' . join(',',@coords);
379        }
380        if ($opts->{attr}) {
381            push @attr, $self->format_sln_attr($mol);
382        }
383        $sln .= '<' . join(';', @attr) . '>' if @attr;
384    }
385    $sln;
386}
387
388
389sub find_ring_bonds {
390    my ($self, $mol, $opts, $atom, $from_bond, $visited, $ring_atoms) = @_;
391
392    $visited->{$atom}  = 1;
393    for my $bn (sorted_bonds_neighbors($atom, $opts)) {
394        my $nei  = $bn->{to};
395        my $bond = $bn->{bond};
396        next if $visited->{$bond};
397        $visited->{$bond}  = 1;
398        if ($visited->{$nei}) { # closed ring
399            #print "closing ring\n";
400            $ring_atoms->{$nei}++;
401        } else {
402            $self->find_ring_bonds($mol, $opts, $nei,
403                $bond, $visited, $ring_atoms);
404        }
405    }
406}
407
408sub branch {
409    my ($self, $mol, $opts, $atom, $from_bond, $visited, $digits, $id_log) = @_;
410
411    my $prev_branch = "";
412    my $sln;
413    $sln .= $self->format_bond($from_bond, $opts);
414    my $digit;
415    if ($digits->{$atom}) {  # opening a ring
416        $digit = $self->next_digit($digits);
417        $digits->{$atom} = $digit;
418    }
419    $sln .= $self->format_atom($atom, $opts, $digit);
420    push @$id_log, $atom->id;
421
422    $visited->{$atom}  = 1;
423    my @bns = sorted_bonds_neighbors($atom, $opts);
424
425    for my $bn (@bns) {
426        my $nei  = $bn->{to};
427        my $bond = $bn->{bond};
428        next if $visited->{$bond};
429        $visited->{$bond} = 1;
430        if ($visited->{$nei}) { # closed a ring
431            if ($prev_branch) {
432                $sln .= "($prev_branch)";
433            }
434            $prev_branch = $self->format_bond($bond, $opts)
435                . '@' . $digits->{$nei};
436            $visited->{$bond} = 1;
437        } else {
438            my $branch = $self->branch($mol, $opts, $nei, $bond, $visited,
439                $digits, $id_log);
440            if ($prev_branch) {
441                $sln .= "($prev_branch)";
442            }
443            $prev_branch = $branch;
444        }
445    }
446    $sln .= "$prev_branch";
447    $sln;
448}
449
450sub next_digit {
451    my ($self, $digits) = @_;
452    ++$digits->{used_digits};
453}
454
455sub collapse_hydrogens {
456    my ($self, $mol) = @_;
457
458    for my $atom (grep {$_->symbol eq 'H'} $mol->atoms) {
459        my ($neighbor) = $atom->neighbors or next;
460        $atom->delete;
461        my $h_count = $neighbor->hydrogens;
462        $h_count++;
463        $neighbor->hydrogens($h_count);
464    }
465}
466
467sub sorted_bonds_neighbors {
468    my ($atom, $opts) = @_;
469    my @bn = $atom->bonds_neighbors;
470    if ($opts->{unique}) {
471        @bn = sort {
472            $a->{to}->attr("canon/class") <=> $b->{to}->attr("canon/class")
473        } @bn;
474    }
475    @bn;
476}
477
478my %ORDER_TO_TYPE = (
479    1 => '', 2 => '=', 3 => '#', 4 => '', 0 => '.',
480);
481
482sub format_bond {
483    my ($self, $bond, $opts) = @_;
484    return '' unless $bond;
485    my $s = $bond->aromatic ? ':' : $ORDER_TO_TYPE{$bond->order};
486    my @attr;
487    @attr = $self->format_sln_attr($bond) if $opts->{attr};
488    if (@attr) {
489        $s .= '[' . join(";", @attr) . ']';
490    }
491    $s;
492}
493
494sub format_atom {
495    my ($self, $atom, $opts, $digit) = @_;
496    my $s;
497    no warnings 'uninitialized';
498    my $h_count = $atom->hydrogens;
499    my $charge  = $atom->formal_charge;
500    my $symbol  = $atom->symbol;
501
502    $charge  = $charge ? sprintf("%+d", $charge): '';
503    $h_count = $h_count ? ($h_count > 1 ? "H$h_count" : 'H') : '';
504
505    $s = $symbol;
506    my @attr;
507    @attr = $self->format_sln_attr($atom) if $opts->{attr};
508    if ($charge or $digit or @attr) {
509        $s .= '[';
510        $s .= $digit;
511        unshift @attr, $charge if $charge;
512        if (@attr) {
513            $s .= ':' if $digit;
514            $s .= join ';', @attr;
515       }
516       $s .= ']';
517    }
518    $s .= $h_count;
519    $s;
520}
521
522sub format_sln_attr {
523    my ($self, $obj) = @_;
524    my $sln_attr = $obj->attr("sln/attr") || {};
525    my @attr;
526    for my $key (sort keys %$sln_attr) {
527        my $val = $sln_attr->{$key};
528        push @attr, "$key" . ($val eq 'TRUE' ? "" : "=$val");
529    }
530    @attr;
531}
532
5331;
534
535=head1 VERSION
536
5370.11
538
539=head1 SEE ALSO
540
541L<Chemistry::Mol>, L<Chemistry::File>, L<Chemistry::File::SMILES>
542
543The PerlMol website L<http://www.perlmol.org/>
544
545Ash, S.; Cline, M. A.; Homer, R. W.; Hurst, T.; Smith, G. B., SYBYL Line
546Notation (SLN): A Versatile Language for Chemical Structure Representation. J.
547Chem. Inf. Comput. Sci; 1997; 37(1); 71-79.  DOI: 10.1021/ci960109j
548(L<http://dx.doi.org/10.1021/ci960109j>)
549
550=head1 AUTHOR
551
552Ivan Tubert-Brohman E<lt>itub@cpan.orgE<gt>
553
554=head1 COPYRIGHT
555
556Copyright (c) 2004 Ivan Tubert-Brohman. All rights reserved. This program is
557free software; you can redistribute it and/or modify it under the same terms as
558Perl itself.
559
560=cut
561
562