1 static char rcsid[] = "$Id: chrom.c 210513 2017-10-12 21:29:59Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5
6 #include "chrom.h"
7 #include <stdio.h>
8 #include <stdlib.h> /* For atoi */
9 #include <string.h>
10 #include "assert.h"
11 #include "mem.h"
12 #include "interval.h"
13
14
15 #ifdef DEBUG
16 #define debug(x) x
17 #else
18 #define debug(x)
19 #endif
20
21
22 /* This is the order for chrom sort */
23 typedef enum {PURE_NUMERIC, SEX_CHROMOSOME, MITOCHONDRIAL, NUMERIC_ALPHA, ALPHA_NUMERIC, PURE_ALPHA} Chromtype_T;
24
25 #ifdef DEBUG
26 static char *
Chromtype_string(Chromtype_T chromtype)27 Chromtype_string (Chromtype_T chromtype) {
28 switch (chromtype) {
29 case PURE_NUMERIC: return "numeric";
30 case SEX_CHROMOSOME: return "sex";
31 case MITOCHONDRIAL: return "mitochondrial";
32 case NUMERIC_ALPHA: return "numeric_alpha";
33 case ALPHA_NUMERIC: return "alpha_numeric";
34 case PURE_ALPHA: return "alpha";
35 default: abort();
36 }
37 }
38 #endif
39
40
41 #define T Chrom_T
42 struct T {
43 Univcoord_T order; /* order used for sorting (can be universal coordinate) */
44 bool numericp;
45 char *string; /* The original string */
46 unsigned int num; /* The initial numeric part; valid only if numericp == true */
47 char *alpha; /* The alphabetic part, possibly after the number; valid only if numericp == true */
48 Chromtype_T chromtype;
49 bool circularp;
50 Chrpos_T alt_scaffold_start;
51 Chrpos_T alt_scaffold_end;
52 };
53
54 void
Chrom_free(T * old)55 Chrom_free (T *old) {
56 FREE((*old)->alpha);
57 FREE((*old)->string);
58 FREE(*old);
59 return;
60 }
61
62
63 char *
Chrom_string(T this)64 Chrom_string (T this) {
65 return this->string;
66 }
67
68 bool
Chrom_circularp(T this)69 Chrom_circularp (T this) {
70 return this->circularp;
71 }
72
73 bool
Chrom_altlocp(Chrpos_T * alt_scaffold_start,Chrpos_T * alt_scaffold_end,T this)74 Chrom_altlocp (Chrpos_T *alt_scaffold_start, Chrpos_T *alt_scaffold_end, T this) {
75 *alt_scaffold_start = this->alt_scaffold_start;
76 *alt_scaffold_end = this->alt_scaffold_end;
77
78 if (*alt_scaffold_end != *alt_scaffold_start) {
79 return true;
80 } else {
81 return false;
82 }
83 }
84
85
86
87 /* Largest number for an unsigned int is 4294967295, which is 10
88 digits. However, the organism with the most chromosomes is the
89 Indian fern, with 1260. Hence, more than 4 digits would suggest
90 a non-chromosomal string. Also, if first digit is '0', treat as
91 a string. */
92
93 T
Chrom_from_string(char * string,char * mitochondrial_string,Univcoord_T order,bool circularp,Chrpos_T alt_scaffold_start,Chrpos_T alt_scaffold_end)94 Chrom_from_string (char *string, char *mitochondrial_string, Univcoord_T order,
95 bool circularp, Chrpos_T alt_scaffold_start, Chrpos_T alt_scaffold_end) {
96 T new = (T) MALLOC(sizeof(*new));
97 int ndigits = 0;
98 char *p;
99 bool mitochondrial_p = false, sex_p = false;
100
101 debug(printf("string is %s\n",string));
102
103 new->order = order;
104 new->circularp = circularp;
105 new->alt_scaffold_start = alt_scaffold_start;
106 new->alt_scaffold_end = alt_scaffold_end;
107
108 new->string = (char *) MALLOC((strlen(string)+1) * sizeof(char));
109 strcpy(new->string,string);
110
111 if (mitochondrial_string != NULL && !strcmp(string,mitochondrial_string)) {
112 mitochondrial_p = true;
113 }
114
115 if (!strncmp(string,"chr",3) || !strncmp(string,"Chr",3)) {
116 /* Ignore leading chr or Chr for sorting purposes */
117 string += 3;
118 debug(printf(" => chop chr to yield %s\n",string));
119 }
120
121 if (!strcmp(string,"X")) {
122 sex_p = true;
123 } else if (!strcmp(string,"Y")) {
124 sex_p = true;
125 } else if (!strcmp(string,"M")) {
126 mitochondrial_p = true;
127 } else if (!strcmp(string,"MT")) {
128 mitochondrial_p = true;
129 } else if (mitochondrial_string != NULL && !strcmp(string,mitochondrial_string)) {
130 mitochondrial_p = true;
131 }
132
133 p = string;
134 while (*p != '\0' && *p >= '0' && *p <= '9') {
135 ndigits++;
136 p++;
137 }
138 debug(printf(" => saw %d digits\n",ndigits));
139
140
141 if (ndigits > 0 && ndigits <= 4 && string[0] != '0') {
142 new->numericp = true;
143 new->num = atoi(string);
144 new->alpha = (char *) CALLOC(strlen(p)+1,sizeof(char));
145 strcpy(new->alpha,p);
146
147 if (mitochondrial_p == true) {
148 new->chromtype = MITOCHONDRIAL;
149 } else if (strlen(new->alpha) == 0) {
150 new->chromtype = PURE_NUMERIC;
151 } else {
152 new->chromtype = NUMERIC_ALPHA;
153 }
154
155 debug(printf(" => numeric with value %d and then alpha %s, type %s\n",
156 new->num,new->alpha,Chromtype_string(new->chromtype)));
157
158 } else {
159 new->numericp = false;
160 new->num = 0;
161 new->alpha = (char *) NULL;
162
163 if (mitochondrial_p == true) {
164 new->chromtype = MITOCHONDRIAL;
165 } else if (sex_p == true) {
166 new->chromtype = SEX_CHROMOSOME;
167 } else {
168 while (*p != '\0' && (*p < '1' || *p > '9')) {
169 /* Stop at initial '1' through '9'. An initial '0' must be alphabetic, not numeric. */
170 p++;
171 }
172 if (*p != '\0') {
173 new->chromtype = ALPHA_NUMERIC;
174 new->num = atoi(p);
175 new->alpha = (char *) MALLOC((p - string + 1)*sizeof(char));
176 strncpy(new->alpha,string,(p - string)*sizeof(char));
177 new->alpha[p - string] = '\0';
178 debug(printf(" => alpha_numeric with alpha %s and then num %d, type %s\n",
179 new->alpha,new->num,Chromtype_string(new->chromtype)));
180 } else {
181 new->chromtype = PURE_ALPHA;
182 debug(printf(" => alphabetical %s, type %s\n",
183 new->string,Chromtype_string(new->chromtype)));
184 }
185 }
186 }
187
188 return new;
189 }
190
191
192 int
Chrom_cmp_alpha(T a,T b)193 Chrom_cmp_alpha (T a, T b) {
194
195 return strcmp(a->string,b->string);
196 }
197
198
199 int
Chrom_cmp_numeric_alpha(T a,T b)200 Chrom_cmp_numeric_alpha (T a, T b) {
201 int cmp;
202
203 if (a->numericp == true && b->numericp == false) {
204 /* 1 and X */
205 return -1;
206 } else if (a->numericp == false && b->numericp == true) {
207 /* X and 1 */
208 return +1;
209 } else if (a->numericp == true && b->numericp == true) {
210 if (a->num < b->num) {
211 /* 1 and 2U */
212 return -1;
213 } else if (a->num > b->num) {
214 /* 2U and 1 */
215 return +1;
216 } else {
217 return strcmp(a->alpha,b->alpha);
218 }
219 } else {
220 cmp = strcmp(a->string,b->string);
221 if (cmp < 0) {
222 return -1;
223 } else if (cmp > 0) {
224 return +1;
225 } else if (a->chromtype == PURE_ALPHA && b->chromtype == ALPHA_NUMERIC) {
226 return -1;
227 } else if (a->chromtype == ALPHA_NUMERIC && b->chromtype == PURE_ALPHA) {
228 return +1;
229 } else if (a->chromtype == PURE_ALPHA && b->chromtype == PURE_ALPHA) {
230 return 0;
231 } else if (a->num < b->num) {
232 /* Chr2 and Chr10 */
233 return -1;
234 } else if (a->num > b->num) {
235 /* Chr10 and Chr2 */
236 return +1;
237 } else {
238 return 0;
239 }
240 }
241 }
242
243
244 int
Chrom_cmp_chrom(T a,T b)245 Chrom_cmp_chrom (T a, T b) {
246 int cmp;
247
248 debug(printf("Comparing %s and %s => ",a->string,b->string));
249
250 if (a->chromtype < b->chromtype) {
251 debug(printf("chromtype %d < %d => -1\n",a->chromtype,b->chromtype));
252 return -1;
253
254 } else if (b->chromtype < a->chromtype) {
255 debug(printf("chromtype %d > %d => +1\n",a->chromtype,b->chromtype));
256 return +1;
257
258 } else if (a->numericp == true && b->numericp == true) {
259 if (a->num < b->num) {
260 /* 1 and 2U */
261 debug(printf("numeric %d < %d => -1\n",a->num,b->num));
262 return -1;
263 } else if (a->num > b->num) {
264 /* 2U and 1 */
265 debug(printf("numeric %d > %d => +1\n",a->num,b->num));
266 return +1;
267 } else {
268 debug(printf("numeric_alpha %s cmp %s => %d\n",
269 a->alpha,b->alpha,strcmp(a->alpha,b->alpha)));
270 return strcmp(a->alpha,b->alpha);
271 }
272
273 } else if (a->chromtype == ALPHA_NUMERIC) {
274 cmp = strcmp(a->alpha,b->alpha);
275 if (cmp < 0) {
276 debug(printf("alpha_numeric %s cmp %s => %d\n",
277 a->alpha,b->alpha,strcmp(a->alpha,b->alpha)));
278 return -1;
279 } else if (cmp > 0) {
280 debug(printf("alpha_numeric %s cmp %s => %d\n",
281 a->alpha,b->alpha,strcmp(a->alpha,b->alpha)));
282 return +1;
283 } else if (a->num < b->num) {
284 /* Chr2 and Chr10 */
285 debug(printf("alpha_numeric %d < %d => -1\n",a->num,b->num));
286 return -1;
287 } else if (a->num > b->num) {
288 /* Chr10 and Chr2 */
289 debug(printf("alpha_numeric %d > %d => +1\n",a->num,b->num));
290 return +1;
291 } else {
292 debug(printf("alpha_numeric %d == %d => %d\n",a->num,b->num,strcmp(a->string,b->string)));
293 return strcmp(a->string,b->string);
294 }
295
296 } else {
297 /* assert(a->chromtype == PURE_ALPHA); or MITOCHONDRIAL or SEX_CHROMOSOME */
298 debug(printf("pure alpha or mitochondrial or sex chromosome %s cmp %s => %d\n",
299 a->string,b->string,strcmp(a->string,b->string)));
300 return strcmp(a->string,b->string);
301 }
302 }
303
304
305 int
Chrom_cmp_order(T a,T b)306 Chrom_cmp_order (T a, T b) {
307
308 if (a->order < b->order) {
309 return -1;
310 } else if (b->order < a->order) {
311 return +1;
312 } else {
313 return 0;
314 }
315 }
316
317
318 /* For use by qsorting an array */
319 int
Chrom_compare_order(const void * x,const void * y)320 Chrom_compare_order (const void *x, const void *y) {
321 T a = * (T *) x;
322 T b = * (T *) y;
323
324 if (a->order < b->order) {
325 return -1;
326 } else if (b->order < a->order) {
327 return +1;
328 } else {
329 return 0;
330 }
331 }
332
333 /* For use by qsorting an array */
334 int
Chrom_compare_alpha(const void * x,const void * y)335 Chrom_compare_alpha (const void *x, const void *y) {
336 T a = * (T *) x;
337 T b = * (T *) y;
338
339 return strcmp(a->string,b->string);
340 }
341
342 /* For use by qsorting an array */
343 int
Chrom_compare_numeric_alpha(const void * x,const void * y)344 Chrom_compare_numeric_alpha (const void *x, const void *y) {
345 T a = * (T *) x;
346 T b = * (T *) y;
347
348 return Chrom_cmp_numeric_alpha(a,b);
349 }
350
351 /* For use by qsorting an array */
352 int
Chrom_compare_chrom(const void * x,const void * y)353 Chrom_compare_chrom (const void *x, const void *y) {
354 T a = * (T *) x;
355 T b = * (T *) y;
356
357 return Chrom_cmp_chrom(a,b);
358 }
359
360
361 /* For use in table comparisons */
362 int
Chrom_compare_table(const void * x,const void * y)363 Chrom_compare_table (const void *x, const void *y) {
364 T a = (T) x;
365 T b = (T) y;
366
367 return Chrom_cmp_chrom(a,b);
368 }
369
370 /* This is the X31 hash function */
371 static unsigned int
string_hash(char * a)372 string_hash (char *a) {
373 unsigned int h = 0U;
374 char *p;
375
376 for (p = a; *p != '\0'; p++) {
377 h = (h << 5) - h + *p;
378 }
379 return h;
380 }
381
382 unsigned int
Chrom_hash_table(const void * key)383 Chrom_hash_table (const void *key) {
384 T this = (T) key;
385
386 return string_hash(this->string);
387 }
388