1 static char rcsid[] = "$Id: chrom.c 210513 2017-10-12 21:29:59Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 
6 #include "chrom.h"
7 #include <stdio.h>
8 #include <stdlib.h>		/* For atoi */
9 #include <string.h>
10 #include "assert.h"
11 #include "mem.h"
12 #include "interval.h"
13 
14 
15 #ifdef DEBUG
16 #define debug(x) x
17 #else
18 #define debug(x)
19 #endif
20 
21 
22 /* This is the order for chrom sort */
23 typedef enum {PURE_NUMERIC, SEX_CHROMOSOME, MITOCHONDRIAL, NUMERIC_ALPHA, ALPHA_NUMERIC, PURE_ALPHA} Chromtype_T;
24 
25 #ifdef DEBUG
26 static char *
Chromtype_string(Chromtype_T chromtype)27 Chromtype_string (Chromtype_T chromtype) {
28   switch (chromtype) {
29   case PURE_NUMERIC: return "numeric";
30   case SEX_CHROMOSOME: return "sex";
31   case MITOCHONDRIAL: return "mitochondrial";
32   case NUMERIC_ALPHA: return "numeric_alpha";
33   case ALPHA_NUMERIC: return "alpha_numeric";
34   case PURE_ALPHA: return "alpha";
35   default: abort();
36   }
37 }
38 #endif
39 
40 
41 #define T Chrom_T
42 struct T {
43   Univcoord_T order;            /* order used for sorting (can be universal coordinate) */
44   bool numericp;
45   char *string;			/* The original string */
46   unsigned int num;		/* The initial numeric part; valid only if numericp == true */
47   char *alpha;			/* The alphabetic part, possibly after the number; valid only if numericp == true */
48   Chromtype_T chromtype;
49   bool circularp;
50   Chrpos_T alt_scaffold_start;
51   Chrpos_T alt_scaffold_end;
52 };
53 
54 void
Chrom_free(T * old)55 Chrom_free (T *old) {
56   FREE((*old)->alpha);
57   FREE((*old)->string);
58   FREE(*old);
59   return;
60 }
61 
62 
63 char *
Chrom_string(T this)64 Chrom_string (T this) {
65   return this->string;
66 }
67 
68 bool
Chrom_circularp(T this)69 Chrom_circularp (T this) {
70   return this->circularp;
71 }
72 
73 bool
Chrom_altlocp(Chrpos_T * alt_scaffold_start,Chrpos_T * alt_scaffold_end,T this)74 Chrom_altlocp (Chrpos_T *alt_scaffold_start, Chrpos_T *alt_scaffold_end, T this) {
75   *alt_scaffold_start = this->alt_scaffold_start;
76   *alt_scaffold_end = this->alt_scaffold_end;
77 
78   if (*alt_scaffold_end != *alt_scaffold_start) {
79     return true;
80   } else {
81     return false;
82   }
83 }
84 
85 
86 
87 /* Largest number for an unsigned int is 4294967295, which is 10
88    digits.  However, the organism with the most chromosomes is the
89    Indian fern, with 1260.  Hence, more than 4 digits would suggest
90    a non-chromosomal string.  Also, if first digit is '0', treat as
91    a string. */
92 
93 T
Chrom_from_string(char * string,char * mitochondrial_string,Univcoord_T order,bool circularp,Chrpos_T alt_scaffold_start,Chrpos_T alt_scaffold_end)94 Chrom_from_string (char *string, char *mitochondrial_string, Univcoord_T order,
95 		   bool circularp, Chrpos_T alt_scaffold_start, Chrpos_T alt_scaffold_end) {
96   T new = (T) MALLOC(sizeof(*new));
97   int ndigits = 0;
98   char *p;
99   bool mitochondrial_p = false, sex_p = false;
100 
101   debug(printf("string is %s\n",string));
102 
103   new->order = order;
104   new->circularp = circularp;
105   new->alt_scaffold_start = alt_scaffold_start;
106   new->alt_scaffold_end = alt_scaffold_end;
107 
108   new->string = (char *) MALLOC((strlen(string)+1) * sizeof(char));
109   strcpy(new->string,string);
110 
111   if (mitochondrial_string != NULL && !strcmp(string,mitochondrial_string)) {
112     mitochondrial_p = true;
113   }
114 
115   if (!strncmp(string,"chr",3) || !strncmp(string,"Chr",3)) {
116     /* Ignore leading chr or Chr for sorting purposes */
117     string += 3;
118     debug(printf("  => chop chr to yield %s\n",string));
119   }
120 
121   if (!strcmp(string,"X")) {
122     sex_p = true;
123   } else if (!strcmp(string,"Y")) {
124     sex_p = true;
125   } else if (!strcmp(string,"M")) {
126     mitochondrial_p = true;
127   } else if (!strcmp(string,"MT")) {
128     mitochondrial_p = true;
129   } else if (mitochondrial_string != NULL && !strcmp(string,mitochondrial_string)) {
130     mitochondrial_p = true;
131   }
132 
133   p = string;
134   while (*p != '\0' && *p >= '0' && *p <= '9') {
135     ndigits++;
136     p++;
137   }
138   debug(printf("  => saw %d digits\n",ndigits));
139 
140 
141   if (ndigits > 0 && ndigits <= 4 && string[0] != '0') {
142     new->numericp = true;
143     new->num = atoi(string);
144     new->alpha = (char *) CALLOC(strlen(p)+1,sizeof(char));
145     strcpy(new->alpha,p);
146 
147     if (mitochondrial_p == true) {
148       new->chromtype = MITOCHONDRIAL;
149     } else if (strlen(new->alpha) == 0) {
150       new->chromtype = PURE_NUMERIC;
151     } else {
152       new->chromtype = NUMERIC_ALPHA;
153     }
154 
155     debug(printf("  => numeric with value %d and then alpha %s, type %s\n",
156 		 new->num,new->alpha,Chromtype_string(new->chromtype)));
157 
158   } else {
159     new->numericp = false;
160     new->num = 0;
161     new->alpha = (char *) NULL;
162 
163     if (mitochondrial_p == true) {
164       new->chromtype = MITOCHONDRIAL;
165     } else if (sex_p == true) {
166       new->chromtype = SEX_CHROMOSOME;
167     } else {
168       while (*p != '\0' && (*p < '1' || *p > '9')) {
169 	/* Stop at initial '1' through '9'.  An initial '0' must be alphabetic, not numeric. */
170 	p++;
171       }
172       if (*p != '\0') {
173 	new->chromtype = ALPHA_NUMERIC;
174 	new->num = atoi(p);
175 	new->alpha = (char *) MALLOC((p - string + 1)*sizeof(char));
176 	strncpy(new->alpha,string,(p - string)*sizeof(char));
177 	new->alpha[p - string] = '\0';
178 	debug(printf("  => alpha_numeric with alpha %s and then num %d, type %s\n",
179 		     new->alpha,new->num,Chromtype_string(new->chromtype)));
180       } else {
181 	new->chromtype = PURE_ALPHA;
182 	debug(printf("  => alphabetical %s, type %s\n",
183 		     new->string,Chromtype_string(new->chromtype)));
184       }
185     }
186   }
187 
188   return new;
189 }
190 
191 
192 int
Chrom_cmp_alpha(T a,T b)193 Chrom_cmp_alpha (T a, T b) {
194 
195   return strcmp(a->string,b->string);
196 }
197 
198 
199 int
Chrom_cmp_numeric_alpha(T a,T b)200 Chrom_cmp_numeric_alpha (T a, T b) {
201   int cmp;
202 
203   if (a->numericp == true && b->numericp == false) {
204     /* 1 and X */
205     return -1;
206   } else if (a->numericp == false && b->numericp == true) {
207     /* X and 1 */
208     return +1;
209   } else if (a->numericp == true && b->numericp == true) {
210     if (a->num < b->num) {
211       /* 1 and 2U */
212       return -1;
213     } else if (a->num > b->num) {
214       /* 2U and 1 */
215       return +1;
216     } else {
217       return strcmp(a->alpha,b->alpha);
218     }
219   } else {
220     cmp = strcmp(a->string,b->string);
221     if (cmp < 0) {
222       return -1;
223     } else if (cmp > 0) {
224       return +1;
225     } else if (a->chromtype == PURE_ALPHA && b->chromtype == ALPHA_NUMERIC) {
226       return -1;
227     } else if (a->chromtype == ALPHA_NUMERIC && b->chromtype == PURE_ALPHA) {
228       return +1;
229     } else if (a->chromtype == PURE_ALPHA && b->chromtype == PURE_ALPHA) {
230       return 0;
231     } else if (a->num < b->num) {
232       /* Chr2 and Chr10 */
233       return -1;
234     } else if (a->num > b->num) {
235       /* Chr10 and Chr2 */
236       return +1;
237     } else {
238       return 0;
239     }
240   }
241 }
242 
243 
244 int
Chrom_cmp_chrom(T a,T b)245 Chrom_cmp_chrom (T a, T b) {
246   int cmp;
247 
248   debug(printf("Comparing %s and %s => ",a->string,b->string));
249 
250   if (a->chromtype < b->chromtype) {
251     debug(printf("chromtype %d < %d => -1\n",a->chromtype,b->chromtype));
252     return -1;
253 
254   } else if (b->chromtype < a->chromtype) {
255     debug(printf("chromtype %d > %d => +1\n",a->chromtype,b->chromtype));
256     return +1;
257 
258   } else if (a->numericp == true && b->numericp == true) {
259     if (a->num < b->num) {
260       /* 1 and 2U */
261       debug(printf("numeric %d < %d => -1\n",a->num,b->num));
262       return -1;
263     } else if (a->num > b->num) {
264       /* 2U and 1 */
265       debug(printf("numeric %d > %d => +1\n",a->num,b->num));
266       return +1;
267     } else {
268       debug(printf("numeric_alpha %s cmp %s => %d\n",
269 		   a->alpha,b->alpha,strcmp(a->alpha,b->alpha)));
270       return strcmp(a->alpha,b->alpha);
271     }
272 
273   } else if (a->chromtype == ALPHA_NUMERIC) {
274     cmp = strcmp(a->alpha,b->alpha);
275     if (cmp < 0) {
276       debug(printf("alpha_numeric %s cmp %s => %d\n",
277 		   a->alpha,b->alpha,strcmp(a->alpha,b->alpha)));
278       return -1;
279     } else if (cmp > 0) {
280       debug(printf("alpha_numeric %s cmp %s => %d\n",
281 		   a->alpha,b->alpha,strcmp(a->alpha,b->alpha)));
282       return +1;
283     } else if (a->num < b->num) {
284       /* Chr2 and Chr10 */
285       debug(printf("alpha_numeric %d < %d => -1\n",a->num,b->num));
286       return -1;
287     } else if (a->num > b->num) {
288       /* Chr10 and Chr2 */
289       debug(printf("alpha_numeric %d > %d => +1\n",a->num,b->num));
290       return +1;
291     } else {
292       debug(printf("alpha_numeric %d == %d => %d\n",a->num,b->num,strcmp(a->string,b->string)));
293       return strcmp(a->string,b->string);
294     }
295 
296   } else {
297     /* assert(a->chromtype == PURE_ALPHA); or MITOCHONDRIAL or SEX_CHROMOSOME */
298     debug(printf("pure alpha or mitochondrial or sex chromosome %s cmp %s => %d\n",
299 		 a->string,b->string,strcmp(a->string,b->string)));
300     return strcmp(a->string,b->string);
301   }
302 }
303 
304 
305 int
Chrom_cmp_order(T a,T b)306 Chrom_cmp_order (T a, T b) {
307 
308   if (a->order < b->order) {
309     return -1;
310   } else if (b->order < a->order) {
311     return +1;
312   } else {
313     return 0;
314   }
315 }
316 
317 
318 /* For use by qsorting an array */
319 int
Chrom_compare_order(const void * x,const void * y)320 Chrom_compare_order (const void *x, const void *y) {
321   T a = * (T *) x;
322   T b = * (T *) y;
323 
324   if (a->order < b->order) {
325     return -1;
326   } else if (b->order < a->order) {
327     return +1;
328   } else {
329     return 0;
330   }
331 }
332 
333 /* For use by qsorting an array */
334 int
Chrom_compare_alpha(const void * x,const void * y)335 Chrom_compare_alpha (const void *x, const void *y) {
336   T a = * (T *) x;
337   T b = * (T *) y;
338 
339   return strcmp(a->string,b->string);
340 }
341 
342 /* For use by qsorting an array */
343 int
Chrom_compare_numeric_alpha(const void * x,const void * y)344 Chrom_compare_numeric_alpha (const void *x, const void *y) {
345   T a = * (T *) x;
346   T b = * (T *) y;
347 
348   return Chrom_cmp_numeric_alpha(a,b);
349 }
350 
351 /* For use by qsorting an array */
352 int
Chrom_compare_chrom(const void * x,const void * y)353 Chrom_compare_chrom (const void *x, const void *y) {
354   T a = * (T *) x;
355   T b = * (T *) y;
356 
357   return Chrom_cmp_chrom(a,b);
358 }
359 
360 
361 /* For use in table comparisons */
362 int
Chrom_compare_table(const void * x,const void * y)363 Chrom_compare_table (const void *x, const void *y) {
364   T a = (T) x;
365   T b = (T) y;
366 
367   return Chrom_cmp_chrom(a,b);
368 }
369 
370 /* This is the X31 hash function */
371 static unsigned int
string_hash(char * a)372 string_hash (char *a) {
373   unsigned int h = 0U;
374   char *p;
375 
376   for (p = a; *p != '\0'; p++) {
377     h = (h << 5) - h + *p;
378   }
379   return h;
380 }
381 
382 unsigned int
Chrom_hash_table(const void * key)383 Chrom_hash_table (const void *key) {
384   T this = (T) key;
385 
386   return string_hash(this->string);
387 }
388