1 #include "samtools.pysam.h"
2
3 /* dict.c -- create a sequence dictionary file.
4
5 Copyright (C) 2015, 2020 Genome Research Ltd.
6
7 Author: Shane McCarthy <sm15@sanger.ac.uk>
8
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 DEALINGS IN THE SOFTWARE. */
26
27 #include <config.h>
28
29 #include <stdio.h>
30 #include <string.h>
31 #include <unistd.h>
32 #include <zlib.h>
33 #include <getopt.h>
34 #include "htslib/kseq.h"
35 #include "htslib/hts.h"
36
37 KSEQ_INIT(gzFile, gzread)
38
39 typedef struct _args_t
40 {
41 char *output_fname, *fname;
42 char *assembly, *species, *uri;
43 int alias, header;
44 }
45 args_t;
46
write_dict(const char * fn,args_t * args)47 static void write_dict(const char *fn, args_t *args)
48 {
49 hts_md5_context *md5;
50 int l, i, k;
51 gzFile fp;
52 kseq_t *seq;
53 unsigned char digest[16];
54 char hex[33];
55
56 fp = strcmp(fn, "-") ? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
57 if (fp == 0) {
58 fprintf(samtools_stderr, "dict: %s: No such file or directory\n", fn);
59 samtools_exit(1);
60 }
61 FILE *out = samtools_stdout;
62 if (args->output_fname) {
63 out = fopen(args->output_fname, "w");
64 if (out == NULL) {
65 fprintf(samtools_stderr, "dict: %s: Cannot open file for writing\n", args->output_fname);
66 samtools_exit(1);
67 }
68 }
69
70 if (!(md5 = hts_md5_init()))
71 samtools_exit(1);
72
73 seq = kseq_init(fp);
74 if (args->header) fprintf(out, "@HD\tVN:1.0\tSO:unsorted\n");
75 while ((l = kseq_read(seq)) >= 0) {
76 for (i = k = 0; i < seq->seq.l; ++i) {
77 if (seq->seq.s[i] >= '!' && seq->seq.s[i] <= '~')
78 seq->seq.s[k++] = toupper(seq->seq.s[i]);
79 }
80 hts_md5_reset(md5);
81 hts_md5_update(md5, (unsigned char*)seq->seq.s, k);
82 hts_md5_final(digest, md5);
83 hts_md5_hex(hex, digest);
84 fprintf(out, "@SQ\tSN:%s\tLN:%d\tM5:%s", seq->name.s, k, hex);
85 if (args->alias) {
86 const char *name = seq->name.s;
87 if (strncmp(name, "chr", 3) == 0) {
88 name += 3;
89 fprintf(out, "\tAN:%s", name);
90 }
91 else
92 fprintf(out, "\tAN:chr%s", name);
93
94 if (strcmp(name, "M") == 0)
95 fprintf(out, ",chrMT,MT");
96 else if (strcmp(name, "MT") == 0)
97 fprintf(out, ",chrM,M");
98 }
99 if (args->uri)
100 fprintf(out, "\tUR:%s", args->uri);
101 else if (strcmp(fn, "-") != 0) {
102 #ifdef _WIN32
103 char *real_path = _fullpath(NULL, fn, PATH_MAX);
104 #else
105 char *real_path = realpath(fn, NULL);
106 #endif
107 fprintf(out, "\tUR:file://%s", real_path);
108 free(real_path);
109 }
110 if (args->assembly) fprintf(out, "\tAS:%s", args->assembly);
111 if (args->species) fprintf(out, "\tSP:%s", args->species);
112 fprintf(out, "\n");
113 }
114 kseq_destroy(seq);
115 hts_md5_destroy(md5);
116
117 if (args->output_fname) fclose(out);
118 gzclose(fp);
119 }
120
dict_usage(void)121 static int dict_usage(void)
122 {
123 fprintf(samtools_stderr, "\n");
124 fprintf(samtools_stderr, "About: Create a sequence dictionary file from a fasta file\n");
125 fprintf(samtools_stderr, "Usage: samtools dict [options] <file.fa|file.fa.gz>\n\n");
126 fprintf(samtools_stderr, "Options: -a, --assembly STR assembly\n");
127 fprintf(samtools_stderr, " -A, --alias, --alternative-name\n");
128 fprintf(samtools_stderr, " add AN tag by adding/removing 'chr'\n");
129 fprintf(samtools_stderr, " -H, --no-header do not print @HD line\n");
130 fprintf(samtools_stderr, " -o, --output FILE file to write out dict file [samtools_stdout]\n");
131 fprintf(samtools_stderr, " -s, --species STR species\n");
132 fprintf(samtools_stderr, " -u, --uri STR URI [file:///abs/path/to/file.fa]\n");
133 fprintf(samtools_stderr, "\n");
134 return 1;
135 }
136
dict_main(int argc,char * argv[])137 int dict_main(int argc, char *argv[])
138 {
139 args_t *args = (args_t*) calloc(1,sizeof(args_t));
140 args->header = 1;
141
142 static const struct option loptions[] =
143 {
144 {"help", no_argument, NULL, 'h'},
145 {"no-header", no_argument, NULL, 'H'},
146 {"alias", no_argument, NULL, 'A'},
147 {"alternative-name", no_argument, NULL, 'A'},
148 {"assembly", required_argument, NULL, 'a'},
149 {"species", required_argument, NULL, 's'},
150 {"uri", required_argument, NULL, 'u'},
151 {"output", required_argument, NULL, 'o'},
152 {NULL, 0, NULL, 0}
153 };
154 int c;
155 while ( (c=getopt_long(argc,argv,"?AhHa:s:u:o:",loptions,NULL))>0 )
156 {
157 switch (c)
158 {
159 case 'A': args->alias = 1; break;
160 case 'a': args->assembly = optarg; break;
161 case 's': args->species = optarg; break;
162 case 'u': args->uri = optarg; break;
163 case 'o': args->output_fname = optarg; break;
164 case 'H': args->header = 0; break;
165 case 'h': return dict_usage();
166 default: return dict_usage();
167 }
168 }
169
170 char *fname = NULL;
171 if ( optind>=argc )
172 {
173 if ( !isatty(STDIN_FILENO) ) fname = "-"; // reading from stdin
174 else return dict_usage();
175 }
176 else fname = argv[optind];
177
178 write_dict(fname, args);
179 free(args);
180 return 0;
181 }
182