1#!/usr/local/bin/perl
2
3# The MIT License
4
5# Copyright (c) 2014, 2020 Genome Research Ltd.
6# Author: Rob Davies <rmd+sam@sanger.ac.uk>
7
8# Permission is hereby granted, free of charge, to any person obtaining a copy
9# of this software and associated documentation files (the "Software"), to deal
10# in the Software without restriction, including without limitation the rights
11# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12# copies of the Software, and to permit persons to whom the Software is
13# furnished to do so, subject to the following conditions:
14
15# The above copyright notice and this permission notice shall be included in
16# all copies or substantial portions of the Software.
17
18# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24# THE SOFTWARE.
25
26# Import references into a cram reference cache from fasta files.
27# See below __END__ for POD documentation.
28
29use strict;
30use warnings;
31use Digest::MD5;
32use Getopt::Long;
33use File::Find;
34use File::Temp qw(tempfile);
35use File::Spec::Functions;
36use File::Path 'make_path';
37use IO::Handle;
38
39$| = 1;
40
41# Directory where the cache will be built
42my $root_dir;
43
44# Number of subdirectories to make below $root_dir
45# Each subdir will eat up two hex digits of the file MD5
46my $subdirs = 2;
47
48# Directory tree to search when using the -find option
49my $find = '';
50
51# How much data to read before spilling to a file
52my $max_acc = 256 * 1024 * 1024;
53
54my $usage = "Usage: $0 -root <dir> [-subdirs <n>] input1.fasta ...\n       $0 -root <dir> [-subdirs <n>] -find <dir>\n";
55
56# Deal with options
57GetOptions("root=s" => \$root_dir, "subdirs=s" => \$subdirs,
58       "find=s" => \$find) || die $usage;
59
60unless ($root_dir && $subdirs =~ /^\d+$/) { die $usage; }
61if ($subdirs >= 16) {
62    die "$0: Error: -subdirs should be less than 15.\n";
63}
64
65# Regexp to convert a hex MD5 to a list of $subdirs subdirectory names, the
66# remainder making the filename in the leaf directory
67my $dest_regexp = "(..)" x $subdirs . "(" . ".." x (16 - $subdirs) . ")";
68my $dest_re = qr/$dest_regexp/;
69
70# Ensure $root_dir exists
71unless (-e $root_dir) {
72    make_path($root_dir);
73}
74
75if ($find) {
76    # Find mode - search a directory tree for anything that looks like a
77    # fasta file.  Any that are found will be put into the new cache, if
78    # they are not already there.
79    find({
80    wanted => sub {
81        find_files($File::Find::name, $root_dir, $dest_re, $max_acc);
82    },
83    no_chdir => 1,
84     },
85     $find);
86} elsif (@ARGV) {
87    # If a list of files was given on the command line, go through them
88    # and try to add each one.
89    foreach my $name (@ARGV) {
90    open(my $fh, '<', $name) || die "Couldn't open $name: $!\n";
91    process_file($name, $fh, $root_dir, $dest_re, $max_acc);
92    close($fh) || die "Error closing $name: $!\n";
93    }
94} else {
95    # Otherwise read from STDIN
96    process_file('STDIN', \*STDIN, $root_dir, $dest_re, $max_acc);
97}
98
99print "\n";
100print "Use environment REF_CACHE=$root_dir" . "/%2s" x $subdirs .
101    "/%s for accessing these files.\n";
102print "See also https://www.htslib.org/workflow/#the-ref_path-and-ref_cache for\nfurther information.\n";
103exit;
104
105sub find_files {
106    my ($name, $root_dir, $dest_re, $max_acc) = @_;
107
108    # See if $name is a candidate file
109
110    my $fh;
111    return if ($name =~ /~$/);        # Ignore backup files
112    return unless (-f $name && -r _); # Ignore non-regular and unreadable files
113
114    # Inspect the first two lines of the candidate
115    my $buffer;
116    open($fh, '<', $name) || die "Couldn't open $name: $!\n";
117    read($fh, $buffer, 8192); # Should be enough to find the header & sequence
118    close($fh) || die "Error closing $name: $!\n";
119    my ($l1, $l2) = split(/\n/, $buffer);
120
121    # Check for fasta-like content
122    return unless ($l1 && $l1 =~ /^>\S+/);
123    return unless ($l2 && $l2 =~ /^[ACGTMRWSYKVHDBNacgtmrwsykvhdbn]+$/);
124
125    # Looks like a fasta file, so process it
126    open($fh, '<', $name) || die "Couldn't open $name: $!\n";
127    process_file($name, $fh, $root_dir, $dest_re, $max_acc);
128    close($fh) || die "Error closing $name: $!\n";
129}
130
131sub process_file {
132    my ($name, $in_fh, $root_dir, $dest_re, $max_acc) = @_;
133
134    # Process the fasta file $in_fh.  Each entry in the file is read, and
135    # the MD5 calculated as described in the SAM specification (i.e.
136    # all uppercased with whitespace stripped out).  The MD5 is then
137    # split using $dest_re to convert it into the path name for the entry
138    # in the cache.  If the path is not already present, the entry (in
139    # uppercased and stripped form) is saved into the cache.
140
141    # Entries shorter that $max_acc will be kept in memory.  For fasta files
142    # with lots of short entries this can save a lot of unnecessary writing
143    # if the data is already in the cache.  Anything longer
144    # gets written out to a file to keep memory consumption under control.
145    # The temporary files have to be made in $root_dir, as the final
146    # destination is not known until the entry has been completely read.
147
148    my $id;        # Name of current fasta entry
149    my $ctx;       # MD5 context
150    my $acc = '';  # The accumulated sequence
151    my $tmpfile;   # Temporary file name
152    my $tmpfh;     # Temporary file handle
153    my $extra = 1024; # Extra space to pre-allocate to account for reading
154                      # 1 line past $max_acc
155    vec($acc, $max_acc + $extra, 8) = 1; # Pre-allocate some space
156    $acc = '';
157
158    # Use an eval block so any stray temporary file can be cleaned up before
159    # exiting.
160    eval {
161    print "Reading $name ...\n";
162    for (;;) { # Error catching form of while (<>) {...}
163        undef($!);
164        last if (eof($in_fh)); # Needed if last line isn't terminated
165        unless (defined($_ = readline($in_fh))) {
166        die "Error reading $name: $!" if $!;
167        last; # EOF
168        }
169
170        if (/^>(\S+)/) {
171        # Found a fasta header
172        if ($ctx) { # Finish previous entry, if there is one
173            finish_entry($id, $ctx, \$acc, $tmpfh, $tmpfile,
174                 $root_dir, $dest_re);
175            undef($tmpfile);
176            $acc = '';
177        }
178        $id = $1;
179        $ctx = Digest::MD5->new();
180        } else {
181        unless ($id) { die "Found sequence with no header\n"; }
182        # Read some sequence
183        chomp;
184        s/\s+//g;
185        if ($_) {
186            $_ = uc($_);
187            $acc .= $_;
188            $ctx->add($_);
189
190            if (length($acc) > $max_acc) {
191            # Spill long sequences out to a temporary file in
192            # $root_dir.
193            unless ($tmpfile) {
194                ($tmpfh, $tmpfile) = tempfile(DIR => $root_dir,
195                              SUFFIX => '.tmp');
196            }
197            print $tmpfh $acc
198                || die "Error writing to $tmpfile: $!\n";
199            $acc = '';
200            }
201        }
202        }
203    }
204    if ($ctx) {
205        # Finish off the last entry
206        finish_entry($id, $ctx, \$acc, $tmpfh, $tmpfile,
207             $root_dir, $dest_re);
208        undef($tmpfile);
209    }
210    };
211    my $err = $@;
212    if ($tmpfile) { unlink($tmpfile); }
213    if ($err) { die $err; }
214}
215
216sub finish_entry {
217    my ($id, $ctx, $acc_ref, $tmpfh, $tmpfile, $root_dir, $dest_re) = @_;
218
219    # Finish writing an entry
220
221    my $digest = $ctx->hexdigest;
222
223    # Get the destination directory and filename
224    my @segs = $digest =~ /$dest_re/;
225    my $dest_dir = (@segs > 1
226            ? catdir($root_dir, @segs[0..($#segs - 1)])
227            : $root_dir);
228    my $dest = catfile($dest_dir, $segs[-1]);
229
230    # Make the destination dir if necessary
231    unless (-e $dest_dir) {
232    make_path($dest_dir);
233    }
234
235    if (-e $dest) {
236    # If the file is already present, there's nothing to do apart from
237    # remove the temporary file if it was made.
238    print "Already exists: $digest $id\n";
239    if ($tmpfile) {
240        close($tmpfh) || die "Error closing $tmpfile: $!\n";
241        unlink($tmpfile) || die "Couldn't remove $tmpfile: $!\n";
242    }
243    } else {
244    # Need to add the data to the cache.
245    unless ($tmpfile) {
246        # If the data hasn't been written already, it needs to be done
247        # now.  Write to a temp file in $dest_dir so if it goes wrong
248        # we won't leave a file with the right name but half-written
249        # content.
250        ($tmpfh, $tmpfile) = tempfile(DIR => $dest_dir,
251                      SUFFIX => '.tmp');
252    }
253
254    # Assert that the $tmpfile is now set
255    unless ($tmpfile) { die "Error: Didn't make a temp file"; }
256
257    eval {
258        # Flush out any remaining data
259        if ($$acc_ref) {
260        print $tmpfh $$acc_ref || die "Error writing to $tmpfile: $!\n";
261        }
262        # Paranoid file close
263        $tmpfh->flush() || die "Error flushing to $tmpfile: $!\n";
264        $tmpfh->sync() || die "Error syncing $tmpfile: $!\n";
265        close($tmpfh) || die "Error writing to $tmpfile: $!\n";
266    };
267    if ($@) {
268        # Attempt to clean up if writing failed
269        my $save = $@;
270        unlink($tmpfile) || warn "Couldn't remove $tmpfile: $!";
271        die $save;
272    }
273
274    # Finished writing, and everything is flushed as far as possible
275    # so rename the temp file
276    print "$dest $id\n";
277    rename($tmpfile, $dest)
278        || die "Error moving $tmpfile to $dest: $!\n";
279    }
280}
281
282__END__
283
284=head1 NAME
285
286seq_cache_populate.pl
287
288=head1 SYNOPSIS
289
290 seq_cache_populate.pl -root <dir> [-subdirs <n>] input1.fasta ...
291
292 seq_cache_populate.pl -root <dir> [-subdirs <n>]  -find <dir>
293
294=head1 DESCRIPTION
295
296Import references into a cram reference cache from fasta files.
297
298When run with a list of fasta files, this program reads the files and stores
299the sequences within it in the reference cache under directory <dir>.  The
300sequences in the cache are stored with names based on the MD5 checksum
301of the sequence.
302
303By default, sequences are stored in a hierarchy two directories deep, to
304keep the number of items in a single directory to a reasonable number.  This
305depth can be changed using the -subdirs option.
306
307If the -find option is used, the program will scan the given directory tree.
308Any files that appear to be fasta (by looking for a first line starting with
309'>' followed by something that looks like DNA sequence) will be read and
310added to the reference cache.  The traversal will ignore symbolic links.
311
312Samtools/htslib can be made to use the cache by appropriate setting of the
313REF_PATH environment variable.  For example, if seq_cache_populate was run
314using options '-root /tmp/ref_cache -subdirs 2', setting REF_PATH to
315'/tmp/ref_cache/%2s/%2s/%s' should allow samtools to find the references that
316it stored.
317
318Note that if no REF_PATH is specified, htslib will default to downloading from
319the EBI reference server and caching locally (see the samtools(1) man page for
320details), defaulting to $HOME/.cache/hts-ref/%2s/%2s/%s.  This is functionally
321equivalent to running this tool with '-root $HOME/.cache/hts-ref -subdirs 2'.
322
323=head1 AUTHOR
324
325Rob Davies.
326
327=cut
328