1 #include "samtools.pysam.h"
2 
3 /*  dict.c -- create a sequence dictionary file.
4 
5     Copyright (C) 2015, 2020 Genome Research Ltd.
6 
7     Author: Shane McCarthy <sm15@sanger.ac.uk>
8 
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15 
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18 
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 DEALINGS IN THE SOFTWARE.  */
26 
27 #include <config.h>
28 
29 #include <stdio.h>
30 #include <string.h>
31 #include <unistd.h>
32 #include <zlib.h>
33 #include <getopt.h>
34 #include "htslib/kseq.h"
35 #include "htslib/hts.h"
36 
37 KSEQ_INIT(gzFile, gzread)
38 
39 typedef struct _args_t
40 {
41     char *output_fname, *fname;
42     char *assembly, *species, *uri;
43     int  alias, header;
44 }
45 args_t;
46 
write_dict(const char * fn,args_t * args)47 static void write_dict(const char *fn, args_t *args)
48 {
49     hts_md5_context *md5;
50     int l, i, k;
51     gzFile fp;
52     kseq_t *seq;
53     unsigned char digest[16];
54     char hex[33];
55 
56     fp = strcmp(fn, "-") ? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
57     if (fp == 0) {
58         fprintf(samtools_stderr, "dict: %s: No such file or directory\n", fn);
59         samtools_exit(1);
60     }
61     FILE *out = samtools_stdout;
62     if (args->output_fname) {
63         out = fopen(args->output_fname, "w");
64         if (out == NULL) {
65           fprintf(samtools_stderr, "dict: %s: Cannot open file for writing\n", args->output_fname);
66           samtools_exit(1);
67         }
68     }
69 
70     if (!(md5 = hts_md5_init()))
71         samtools_exit(1);
72 
73     seq = kseq_init(fp);
74     if (args->header) fprintf(out, "@HD\tVN:1.0\tSO:unsorted\n");
75     while ((l = kseq_read(seq)) >= 0) {
76         for (i = k = 0; i < seq->seq.l; ++i) {
77             if (seq->seq.s[i] >= '!' && seq->seq.s[i] <= '~')
78                 seq->seq.s[k++] = toupper(seq->seq.s[i]);
79         }
80         hts_md5_reset(md5);
81         hts_md5_update(md5, (unsigned char*)seq->seq.s, k);
82         hts_md5_final(digest, md5);
83         hts_md5_hex(hex, digest);
84         fprintf(out, "@SQ\tSN:%s\tLN:%d\tM5:%s", seq->name.s, k, hex);
85         if (args->alias) {
86             const char *name = seq->name.s;
87             if (strncmp(name, "chr", 3) == 0) {
88                 name += 3;
89                 fprintf(out, "\tAN:%s", name);
90             }
91             else
92                 fprintf(out, "\tAN:chr%s", name);
93 
94             if (strcmp(name, "M") == 0)
95                 fprintf(out, ",chrMT,MT");
96             else if (strcmp(name, "MT") == 0)
97                 fprintf(out, ",chrM,M");
98         }
99         if (args->uri)
100             fprintf(out, "\tUR:%s", args->uri);
101         else if (strcmp(fn, "-") != 0) {
102 #ifdef _WIN32
103             char *real_path = _fullpath(NULL, fn, PATH_MAX);
104 #else
105             char *real_path = realpath(fn, NULL);
106 #endif
107             fprintf(out, "\tUR:file://%s", real_path);
108             free(real_path);
109         }
110         if (args->assembly) fprintf(out, "\tAS:%s", args->assembly);
111         if (args->species) fprintf(out, "\tSP:%s", args->species);
112         fprintf(out, "\n");
113     }
114     kseq_destroy(seq);
115     hts_md5_destroy(md5);
116 
117     if (args->output_fname) fclose(out);
118     gzclose(fp);
119 }
120 
dict_usage(void)121 static int dict_usage(void)
122 {
123     fprintf(samtools_stderr, "\n");
124     fprintf(samtools_stderr, "About:   Create a sequence dictionary file from a fasta file\n");
125     fprintf(samtools_stderr, "Usage:   samtools dict [options] <file.fa|file.fa.gz>\n\n");
126     fprintf(samtools_stderr, "Options: -a, --assembly STR    assembly\n");
127     fprintf(samtools_stderr, "         -A, --alias, --alternative-name\n");
128     fprintf(samtools_stderr, "                               add AN tag by adding/removing 'chr'\n");
129     fprintf(samtools_stderr, "         -H, --no-header       do not print @HD line\n");
130     fprintf(samtools_stderr, "         -o, --output FILE     file to write out dict file [samtools_stdout]\n");
131     fprintf(samtools_stderr, "         -s, --species STR     species\n");
132     fprintf(samtools_stderr, "         -u, --uri STR         URI [file:///abs/path/to/file.fa]\n");
133     fprintf(samtools_stderr, "\n");
134     return 1;
135 }
136 
dict_main(int argc,char * argv[])137 int dict_main(int argc, char *argv[])
138 {
139     args_t *args = (args_t*) calloc(1,sizeof(args_t));
140     args->header = 1;
141 
142     static const struct option loptions[] =
143     {
144         {"help", no_argument, NULL, 'h'},
145         {"no-header", no_argument, NULL, 'H'},
146         {"alias", no_argument, NULL, 'A'},
147         {"alternative-name", no_argument, NULL, 'A'},
148         {"assembly", required_argument, NULL, 'a'},
149         {"species", required_argument, NULL, 's'},
150         {"uri", required_argument, NULL, 'u'},
151         {"output", required_argument, NULL, 'o'},
152         {NULL, 0, NULL, 0}
153     };
154     int c;
155     while ( (c=getopt_long(argc,argv,"?AhHa:s:u:o:",loptions,NULL))>0 )
156     {
157         switch (c)
158         {
159             case 'A': args->alias = 1; break;
160             case 'a': args->assembly = optarg; break;
161             case 's': args->species = optarg; break;
162             case 'u': args->uri = optarg; break;
163             case 'o': args->output_fname = optarg; break;
164             case 'H': args->header = 0; break;
165             case 'h': return dict_usage();
166             default: return dict_usage();
167         }
168     }
169 
170     char *fname = NULL;
171     if ( optind>=argc )
172     {
173         if ( !isatty(STDIN_FILENO) ) fname = "-";  // reading from stdin
174         else return dict_usage();
175     }
176     else fname = argv[optind];
177 
178     write_dict(fname, args);
179     free(args);
180     return 0;
181 }
182