1 /*
2 This file is part of Primer Pooler (c) Silas S. Brown.  For Wen.
3 
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
7 
8     http://www.apache.org/licenses/LICENSE-2.0
9 
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15 */
16 #ifdef __APPLE__
17 #define _FORTIFY_SOURCE 0 /* Mac OS 10.7 OpenMP bug workaround */
18 #endif
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <stdint.h>
23 #include <time.h>
24 #include <assert.h>
25 #include "ansi.h"
26 #include "bit-basics.h"
27 #include "debug.h"
28 #include "openmp.h"
29 #include "memcheck.h"
30 #include "numbers.h"
31 
32 #define PARALLELIZE_CHROMOSOMES 1
33 
34 typedef uint32_t b32;
byteSwap32(b32 i)35 static inline b32 byteSwap32(b32 i) {
36   switch(0){case 0:case sizeof(b32)==4:;} /* must be EXACTLY 4 bytes for this code to work */
37   i = (i>>16) | (i<<16);
38   return ((i&0xFF00FF00)>>8) | ((i&0x00FF00FF)<<8);
39 }
read2bSwap(FILE * f,int byteSwap)40 static b32 read2bSwap(FILE *f,int byteSwap) {
41   b32 p; if(!fread(&p,sizeof(p),1,f)) {
42     fputs("Read error\n",stderr); return 0;
43   }
44   if(byteSwap) p=byteSwap32(p);
45   return p;
46 }
read_2bit_nSeqs(FILE * f,int * byteSwap,long * seqPtr)47 static int read_2bit_nSeqs(FILE *f,int *byteSwap,long *seqPtr) {
48   switch(0){case 0:case sizeof(b32)==4:;} /* as above */
49   switch(read2bSwap(f,0)) {
50   case 0x1A412743: *byteSwap=0; break;
51   case 0x4327411A: *byteSwap=1; break;
52   default: fputs("Invalid signature (is this really a .2bit genome file?)\n",stderr); return 0;
53   }
54   if(read2bSwap(f,0 /* *byteSwap, but doesn't matter if we're comparing result against 0 */)) { fputs("Invalid version\n",stderr); return 0; }
55   int seqCount = read2bSwap(f,*byteSwap);
56   read2bSwap(f,0); // reserved (again *byteSwap not nec)
57   *seqPtr = ftell(f); return seqCount;
58 }
59 typedef char SeqName[256];
60 SeqName lastSequenceNameRead={0};
61 int variants_skipped;
is_variant()62 static int is_variant() {
63   if (strchr(lastSequenceNameRead,'_') != NULL
64       || strchr(lastSequenceNameRead,'-') != NULL) {
65     ++variants_skipped;
66     return 1;
67   } else return 0;
68 }
read_2bit_nBases(FILE * f,int byteSwap,long * seqPtr,b32 ** unknownStart,b32 ** unknownLen,b32 * nBases,int * isVariant,int ignoreVars)69 static int read_2bit_nBases(FILE *f,int byteSwap,long *seqPtr,b32* *unknownStart,b32* *unknownLen,b32 *nBases,int *isVariant,int ignoreVars) {
70   if (fseek(f,*seqPtr,SEEK_SET)) { fputs("Seek error reading sequence offset\n",stderr); return 0; }
71   int seqNameLen = getc(f);
72   if(!fread(lastSequenceNameRead,1,seqNameLen,f)) { fputs("Error reading sequence name\n",stderr); return 0; }
73   lastSequenceNameRead[seqNameLen]=0;
74   if(ignoreVars) *isVariant = is_variant();
75   else *isVariant = 0;
76 #ifdef Debug_ChromosomeCheck
77   *isVariant = strcmp(lastSequenceNameRead,Debug_ChromosomeCheck); /* treat any OTHER chromosome as a variant we won't read, for debugging with just one */
78 #endif
79   long offset=read2bSwap(f,byteSwap);
80   if(!offset) { fputs("No sequence offset\n",stderr); return 0; }
81   *seqPtr = ftell(f);
82   if(fseek(f,offset,SEEK_SET)) { fputs("Seek error loading sequence\n",stderr); return 0; }
83   *nBases=read2bSwap(f,byteSwap);
84   #ifdef Debug_ChromosomeCheck
85   if(!*isVariant) fprintf(stderr,"Chromosome size is %d\n",*nBases);
86   #endif
87   b32 nUnknown = read2bSwap(f,byteSwap);
88   *unknownStart = malloc(sizeof(b32)*(nUnknown+1)); /* we'll add an extra one at the end so don't have to keep checking unknownPtr<nUnknown */
89   *unknownLen = malloc(sizeof(b32)*(nUnknown+1));
90   if(memFail(*unknownLen,*unknownStart,_memFail)) return 0;
91   b32 i; for(i=0; i<nUnknown; i++) (*unknownStart)[i]=read2bSwap(f,byteSwap); (*unknownStart)[i]=*nBases;
92   for(i=0; i<nUnknown; i++) { (*unknownLen)[i] = read2bSwap(f,byteSwap); assert((*unknownLen)[i]); } (*unknownLen)[i]=0; /* if get unknownLen==0, need to adjust the 'else' branch of 'baseNo < *unknownStart' below to allow for this possibility (or just remove that block, running mmove on unknownStart to keep it in sync) */
93   if(fseek(f,read2bSwap(f,byteSwap)*4*2 + 4,SEEK_CUR)) // skip 'masked blocks' (TODO: are these ever relevant?) and the 4-byte 'reserved' word
94     { fputs("Seek error skipping mask\n",stderr); return 0; }
95   return 1; // and ftell = start; *seqPtr = next seq, unknown.. needs free
96 }
97 
addBase(bit64 * buf,bit64 * valid,unsigned char byte,int * basesLeftInByte)98 static inline void addBase(bit64 *buf,bit64 *valid,unsigned char byte,int *basesLeftInByte) {
99   /* This function has to be FAST: seriously inner-loop.
100      For ease of binary search in amplicons.c, bases are
101      shifted into buf from LEFT (not right as in 64.h),
102      so buf contains the last few bases IN REVERSE from
103      the genome cursor (which is an 'end' cursor).  */
104   *buf = ((*buf) >> 2) | ((((bit64)byte >> (2*--*basesLeftInByte)) & (bit64)3) << 62);
105   if(*valid != ~(bit64)0) /* (only at start, so put the 'if' around it to save a couple of instructions) */
106     *valid = ((*valid) >> 2) | ((bit64)3 << 62);
107 }
108 
addFastaBase(bit64 * buf,bit64 * valid,char letter)109 static inline void addFastaBase(bit64 *buf,bit64 *valid,char letter) {
110   int basesLeftInByte = 1, twobit;
111   switch(letter) {
112   case 'C': case 'c': twobit = 1; break;
113   case 'A': case 'a': twobit = 2; break;
114   case 'G': case 'g': twobit = 3; break;
115   case ' ': case '\t': case '\r': case '\n': return;
116   default: twobit = 0; /* TODO: might be degenerate bases like 'M'; use these in nonspecific-amplicon checks? (but hopefully they're using .2bit anyway) */
117   }
118   addBase(buf,valid,twobit,&basesLeftInByte);
119 }
120 
121 void look(bit64,bit64,int,b32); /* could pass this into go_through_genome as a pointer, but doing so will slow us down; there's only one in the program so let's let -flto consider it for inlining */
122 int allocateSeqs(int nSeq); /* ditto */
123 int allocateAnotherSeq(int nSeq); /* ditto */
124 
is_fasta(FILE * f)125 static int is_fasta(FILE *f) {
126   // Check if we're looking at a FASTA file instead of a 2bit file.
127   // Assume FASTA will begin with '>' or newline + '>' or BOM + '>'
128   // If FASTA is detected, seek past the first '>'.  Otherwise rewind.
129   char dat[5]; if(!fread(dat,4,1,f)) *dat=0;
130   rewind(f); dat[4]=0;
131   if (strspn(dat,"\r\n\xef\xbb\xbf>")) {
132     while(fgetc(f)!='>') if(feof(f)) {
133         fprintf(stderr,"FASTA file with no sequences??\n");
134         rewind(f); return 0;
135       }
136     return 1;
137   } else return 0;
138 }
139 
readFastaSeqName(FILE * f)140 static void readFastaSeqName(FILE *f) {
141   // assume 'f' is positioned just after the '>'
142   if(!fgets(lastSequenceNameRead,sizeof(SeqName),f))
143     *lastSequenceNameRead = 0;
144   if(lastSequenceNameRead[strlen(lastSequenceNameRead)-1]!='\n') {
145     while (fgetc(f) != '\n') { if(feof(f)) break; }
146   }
147   lastSequenceNameRead[strcspn(lastSequenceNameRead," \t\r\n")]=0;
148 }
takeFastaSeqName(FILE * f,const char * buf)149 static void takeFastaSeqName(FILE *f,const char *buf) {
150   strcpy(lastSequenceNameRead,buf+1); /* ignore '>' */
151   lastSequenceNameRead[strcspn(lastSequenceNameRead," \t\r\n")]=0;
152   if(buf[strlen(buf)-1]!='\n') {
153     while (fgetc(f) != '\n') { if(feof(f)) break; }
154   }
155 }
156 
fasta_genome(FILE * f,int ignoreVars)157 static SeqName* fasta_genome(FILE *f,int ignoreVars) {
158   fprintf(stderr,"Reading genome from FASTA file\n(slower than .2bit; may take time)\n");
159   int seqNo=0; char buf[80];
160   SeqName *seqNames=NULL;
161   while(!feof(f)) {
162     if(seqNo) takeFastaSeqName(f,buf);
163     else readFastaSeqName(f);
164     if(ignoreVars && is_variant()) {
165       while(fgets(buf,sizeof(buf),f) && *buf != '>');
166       continue;
167     }
168     fprintf(stderr,"%s\n",lastSequenceNameRead);
169     SeqName *lastSeqNames = seqNames;
170     seqNames = realloc(seqNames,(seqNo+1)*sizeof(SeqName));
171     if(!seqNames || allocateAnotherSeq(seqNo+1)==seqNo) {
172       free(lastSeqNames);
173       fprintf(stderr,"Genome metadata: Out of memory!\n"); break;
174     }
175     wrapped_memcpy(seqNames[seqNo],lastSequenceNameRead,sizeof(SeqName));
176     bit64 curBuf=0,curValid=0; b32 baseNo=0;
177     while(fgets(buf,sizeof(buf),f) && *buf != '>') {
178       char *b;
179       for(b=buf; *b; b++) {
180         switch (*b) {
181         case 'N': case 'n': curBuf = curValid = 0; break;
182         default:
183           addFastaBase(&curBuf,&curValid,*b);
184           look(curBuf,curValid,seqNo,baseNo);
185         } baseNo++;
186       }
187     }
188     seqNo++;
189   }
190   fprintf(stderr,"End of FASTA genome scan\n"); return seqNames;
191 }
192 
go_through_genome(FILE * f,int ignoreVars)193 SeqName* go_through_genome(FILE *f,int ignoreVars) {
194   variants_skipped = 0;
195   if(is_fasta(f)) return fasta_genome(f,ignoreVars);
196   int byteSwap=0; long seqPtr=0; // =0 to suppress warning
197   int nSeq=read_2bit_nSeqs(f,&byteSwap,&seqPtr);
198   if(!allocateSeqs(nSeq)) return NULL;
199   int seqsDone = 0;
200   SeqName *seqNames=NULL; int numSeqNames=0;
201   time_t start = time(NULL);
202   time_t nextDisplay = start + 1;
203   if(omp_get_max_threads() > 1) {
204     fprintf(stderr,"Parallelising scan (up to %d chromosomes simultaneously)\n",omp_get_max_threads());
205   }
206   int seqNo; char progressBuf[80]={0}; /* TODO: different screen widths?  (low priority because it could just scroll, but 8 cores on a narrow terminal could be messy) */ enum { ProgWidthPerThread = 13 /* sequence name width is this minus 5; try to divide into the screen width and also allow for the possiblity of narrower terminals */ };
207   #if PARALLELIZE_CHROMOSOMES && defined(_OPENMP)
208   #pragma omp parallel for schedule(dynamic)
209   #endif
210   for(seqNo=0;seqNo<nSeq;seqNo++) {
211     time_t nextThreadUpdate = time(NULL)+1; int tNum = omp_get_thread_num(); char *pgBuf = ((tNum+1)*ProgWidthPerThread > sizeof(progressBuf)) ? NULL : (progressBuf+tNum*ProgWidthPerThread); int isRHS=(tNum==omp_get_num_threads()-1) || ((tNum+2)*ProgWidthPerThread)>sizeof(progressBuf);
212     b32 *allUnknownStart=0,*allUnknownLen=0,nBases=0; /* =0 for old compilers (don't warn) */
213     int isVariant,renumberedSeqNo=0; /* =0 for old compilers (don't warn) */
214     #if PARALLELIZE_CHROMOSOMES && defined(_OPENMP)
215     long ft=0; /* =0 for old compilers (don't warn) */
216     #endif
217     b32 baseNo = 0, *unknownStart=0, *unknownLen=0; /* last two =0 for old compilers */
218     int bufBytes=0;/* =0 for old compilers (don't warn) */
219     char *buf=NULL;
220     if(numSeqNames && !seqNames) continue; /* would have been 'break' below w/out OpenMP */
221     #if PARALLELIZE_CHROMOSOMES && defined(_OPENMP)
222     #pragma omp critical
223     #endif
224     {
225     if(read_2bit_nBases(f,byteSwap,&seqPtr,&allUnknownStart,&allUnknownLen,&nBases,&isVariant,ignoreVars)) {
226     if(isVariant) { free(allUnknownStart); free(allUnknownLen); } else {
227     if((renumberedSeqNo = seqsDone++) >= numSeqNames) {
228       numSeqNames = (numSeqNames+1)<<1;
229       seqNames = realloc(seqNames,numSeqNames*sizeof(SeqName)); memFail(seqNames,allUnknownLen,allUnknownStart,_memFail);
230     } if(seqNames) wrapped_memcpy(seqNames[renumberedSeqNo],lastSequenceNameRead,sizeof(SeqName));
231     unknownStart=allUnknownStart;unknownLen=allUnknownLen;
232     bufBytes = nBases/4;if(bufBytes) buf=malloc(bufBytes);
233     if(buf) {
234       if(!fread(buf,1,bufBytes,f)) {
235         fprintf(stderr,"\nError reading %d bytes (current pointer is &%lx)\nCorrupt genome file?\n",bufBytes,ftell(f));
236         free(buf); if(seqNames) free(seqNames); seqNames=NULL; /* would be 'return NULL' w/out OpenMP */
237       }
238     } else bufBytes=0;
239     #if PARALLELIZE_CHROMOSOMES && defined(_OPENMP)
240     if(f) ft = ftell(f);
241     #endif
242     } } else isVariant=1; }
243     if(isVariant || !seqNames) continue;
244     int bufPtr = 0;
245     bit64 curBuf=0,curValid=0; int basesLeftInByte = 0;
246     unsigned char byte = 0; // =0 to suppress warning
247     while(baseNo<nBases) {
248       /* OK, let's go through all the bases.  And try not
249          to think of the daft 2001-ish "Internet meme".
250          TODO: it might be possible to manually unroll
251          this loop a bit so fewer tests are needed on
252          baseNo etc. */
253       if(!basesLeftInByte) {
254         if(bufPtr==bufBytes)
255           #if PARALLELIZE_CHROMOSOMES && defined(_OPENMP)
256           #pragma omp critical
257           #endif
258           {
259           #if PARALLELIZE_CHROMOSOMES && defined(_OPENMP)
260             fseek(f,ft++,SEEK_SET);
261           #endif
262             byte = getc(f);
263           } else byte=buf[bufPtr++];
264         basesLeftInByte = 4;
265         if(pgBuf && time(NULL) >= nextThreadUpdate)
266           #if PARALLELIZE_CHROMOSOMES && defined(_OPENMP)
267           #pragma omp critical
268           #endif
269           {
270           int outputted=sprintf(pgBuf,"%*s %2d%%",ProgWidthPerThread-5,seqNames[renumberedSeqNo],(int)((float)baseNo*100.0/(float)nBases));
271           if(!isRHS) pgBuf[outputted]=' ';
272           nextThreadUpdate = time(NULL)+1;
273         }
274         if(time(NULL) >= nextDisplay)
275           #if PARALLELIZE_CHROMOSOMES && defined(_OPENMP)
276           #pragma omp critical
277           #endif
278           // (this comment added to work around an auto-indent bug in some Emacs versions)
279           {
280           if(time(NULL) >= nextDisplay) {
281           if(omp_get_num_threads()==1) fputs("\rScanning ",stderr);
282           else fputs("\r",stderr); /* Scanning message will have been printed above, and we have many columns to worry about */
283           fputs(progressBuf,stderr); fflush(stderr);
284           nextDisplay = time(NULL) + 2;
285           }}
286       }
287       if(baseNo < *unknownStart) {
288         addBase(&curBuf,&curValid,byte,&basesLeftInByte);
289         look(curBuf,curValid,renumberedSeqNo,++baseNo);
290       } else { /* we're in an 'unknown' region */
291         --basesLeftInByte; /* ignore this one */
292         if(++baseNo == *unknownStart + *unknownLen) {
293           unknownStart++; unknownLen++;
294           curBuf = curValid = 0;
295         }
296       }
297     }
298     if(buf) free(buf);
299     free(allUnknownStart); free(allUnknownLen);
300     if(pgBuf) memset(pgBuf,' ',ProgWidthPerThread-isRHS);
301   }
302   fprintf(stderr,"\rGenome scan complete");
303   prnSeconds((long)(time(NULL)-start));
304   fputs(clearEOL(),stderr);
305   fputs("\n",stderr);
306   return seqNames;
307 }
308 
output_genome_segment(FILE * f,int targetRenumberedSeqNo,b32 baseStart,int nBases,FILE * out,int ignoreVars)309 void output_genome_segment(FILE *f,int targetRenumberedSeqNo,b32 baseStart,int nBases,FILE *out,int ignoreVars) {
310   /* Cut-down version of go_through_genome for use in reports */
311   int byteSwap=0; long seqPtr=0;
312   rewind(f);
313   int nSeq=read_2bit_nSeqs(f,&byteSwap,&seqPtr);
314   int seqsDone = 0, seqNo;
315   for(seqNo=0;seqNo<nSeq;seqNo++) {
316     int isVariant;
317     b32 *allUnknownStart=0,*allUnknownLen=0, nb0;
318     if(read_2bit_nBases(f,byteSwap,&seqPtr,&allUnknownStart,&allUnknownLen,&nb0,&isVariant,ignoreVars)) {
319     free(allUnknownStart); free(allUnknownLen);
320     if (!isVariant && (seqsDone++ == targetRenumberedSeqNo)) {
321       fseek(f,--baseStart/4,SEEK_CUR); // 1st is 0 not 1
322       int shift = 2*(baseStart & 3);
323       int mask = (128|64) >> shift; shift = 6 - shift;
324       do {
325         unsigned char byte = getc(f);
326         while(mask && nBases) {
327           fputc("TCAG"[(byte >> shift) & 3], out);
328           nBases--; shift-=2; mask>>=2;
329         }
330         mask = 128|64; shift = 6;
331       } while(nBases);
332       break;
333     }}
334   }
335 }
336