1 /*
2 This file is part of Primer Pooler (c) Silas S. Brown. For Wen.
3
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15 */
16 #ifdef __APPLE__
17 #define _FORTIFY_SOURCE 0 /* Mac OS 10.7 OpenMP bug workaround */
18 #endif
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <stdint.h>
23 #include <time.h>
24 #include <assert.h>
25 #include "ansi.h"
26 #include "bit-basics.h"
27 #include "debug.h"
28 #include "openmp.h"
29 #include "memcheck.h"
30 #include "numbers.h"
31
32 #define PARALLELIZE_CHROMOSOMES 1
33
34 typedef uint32_t b32;
byteSwap32(b32 i)35 static inline b32 byteSwap32(b32 i) {
36 switch(0){case 0:case sizeof(b32)==4:;} /* must be EXACTLY 4 bytes for this code to work */
37 i = (i>>16) | (i<<16);
38 return ((i&0xFF00FF00)>>8) | ((i&0x00FF00FF)<<8);
39 }
read2bSwap(FILE * f,int byteSwap)40 static b32 read2bSwap(FILE *f,int byteSwap) {
41 b32 p; if(!fread(&p,sizeof(p),1,f)) {
42 fputs("Read error\n",stderr); return 0;
43 }
44 if(byteSwap) p=byteSwap32(p);
45 return p;
46 }
read_2bit_nSeqs(FILE * f,int * byteSwap,long * seqPtr)47 static int read_2bit_nSeqs(FILE *f,int *byteSwap,long *seqPtr) {
48 switch(0){case 0:case sizeof(b32)==4:;} /* as above */
49 switch(read2bSwap(f,0)) {
50 case 0x1A412743: *byteSwap=0; break;
51 case 0x4327411A: *byteSwap=1; break;
52 default: fputs("Invalid signature (is this really a .2bit genome file?)\n",stderr); return 0;
53 }
54 if(read2bSwap(f,0 /* *byteSwap, but doesn't matter if we're comparing result against 0 */)) { fputs("Invalid version\n",stderr); return 0; }
55 int seqCount = read2bSwap(f,*byteSwap);
56 read2bSwap(f,0); // reserved (again *byteSwap not nec)
57 *seqPtr = ftell(f); return seqCount;
58 }
59 typedef char SeqName[256];
60 SeqName lastSequenceNameRead={0};
61 int variants_skipped;
is_variant()62 static int is_variant() {
63 if (strchr(lastSequenceNameRead,'_') != NULL
64 || strchr(lastSequenceNameRead,'-') != NULL) {
65 ++variants_skipped;
66 return 1;
67 } else return 0;
68 }
read_2bit_nBases(FILE * f,int byteSwap,long * seqPtr,b32 ** unknownStart,b32 ** unknownLen,b32 * nBases,int * isVariant,int ignoreVars)69 static int read_2bit_nBases(FILE *f,int byteSwap,long *seqPtr,b32* *unknownStart,b32* *unknownLen,b32 *nBases,int *isVariant,int ignoreVars) {
70 if (fseek(f,*seqPtr,SEEK_SET)) { fputs("Seek error reading sequence offset\n",stderr); return 0; }
71 int seqNameLen = getc(f);
72 if(!fread(lastSequenceNameRead,1,seqNameLen,f)) { fputs("Error reading sequence name\n",stderr); return 0; }
73 lastSequenceNameRead[seqNameLen]=0;
74 if(ignoreVars) *isVariant = is_variant();
75 else *isVariant = 0;
76 #ifdef Debug_ChromosomeCheck
77 *isVariant = strcmp(lastSequenceNameRead,Debug_ChromosomeCheck); /* treat any OTHER chromosome as a variant we won't read, for debugging with just one */
78 #endif
79 long offset=read2bSwap(f,byteSwap);
80 if(!offset) { fputs("No sequence offset\n",stderr); return 0; }
81 *seqPtr = ftell(f);
82 if(fseek(f,offset,SEEK_SET)) { fputs("Seek error loading sequence\n",stderr); return 0; }
83 *nBases=read2bSwap(f,byteSwap);
84 #ifdef Debug_ChromosomeCheck
85 if(!*isVariant) fprintf(stderr,"Chromosome size is %d\n",*nBases);
86 #endif
87 b32 nUnknown = read2bSwap(f,byteSwap);
88 *unknownStart = malloc(sizeof(b32)*(nUnknown+1)); /* we'll add an extra one at the end so don't have to keep checking unknownPtr<nUnknown */
89 *unknownLen = malloc(sizeof(b32)*(nUnknown+1));
90 if(memFail(*unknownLen,*unknownStart,_memFail)) return 0;
91 b32 i; for(i=0; i<nUnknown; i++) (*unknownStart)[i]=read2bSwap(f,byteSwap); (*unknownStart)[i]=*nBases;
92 for(i=0; i<nUnknown; i++) { (*unknownLen)[i] = read2bSwap(f,byteSwap); assert((*unknownLen)[i]); } (*unknownLen)[i]=0; /* if get unknownLen==0, need to adjust the 'else' branch of 'baseNo < *unknownStart' below to allow for this possibility (or just remove that block, running mmove on unknownStart to keep it in sync) */
93 if(fseek(f,read2bSwap(f,byteSwap)*4*2 + 4,SEEK_CUR)) // skip 'masked blocks' (TODO: are these ever relevant?) and the 4-byte 'reserved' word
94 { fputs("Seek error skipping mask\n",stderr); return 0; }
95 return 1; // and ftell = start; *seqPtr = next seq, unknown.. needs free
96 }
97
addBase(bit64 * buf,bit64 * valid,unsigned char byte,int * basesLeftInByte)98 static inline void addBase(bit64 *buf,bit64 *valid,unsigned char byte,int *basesLeftInByte) {
99 /* This function has to be FAST: seriously inner-loop.
100 For ease of binary search in amplicons.c, bases are
101 shifted into buf from LEFT (not right as in 64.h),
102 so buf contains the last few bases IN REVERSE from
103 the genome cursor (which is an 'end' cursor). */
104 *buf = ((*buf) >> 2) | ((((bit64)byte >> (2*--*basesLeftInByte)) & (bit64)3) << 62);
105 if(*valid != ~(bit64)0) /* (only at start, so put the 'if' around it to save a couple of instructions) */
106 *valid = ((*valid) >> 2) | ((bit64)3 << 62);
107 }
108
addFastaBase(bit64 * buf,bit64 * valid,char letter)109 static inline void addFastaBase(bit64 *buf,bit64 *valid,char letter) {
110 int basesLeftInByte = 1, twobit;
111 switch(letter) {
112 case 'C': case 'c': twobit = 1; break;
113 case 'A': case 'a': twobit = 2; break;
114 case 'G': case 'g': twobit = 3; break;
115 case ' ': case '\t': case '\r': case '\n': return;
116 default: twobit = 0; /* TODO: might be degenerate bases like 'M'; use these in nonspecific-amplicon checks? (but hopefully they're using .2bit anyway) */
117 }
118 addBase(buf,valid,twobit,&basesLeftInByte);
119 }
120
121 void look(bit64,bit64,int,b32); /* could pass this into go_through_genome as a pointer, but doing so will slow us down; there's only one in the program so let's let -flto consider it for inlining */
122 int allocateSeqs(int nSeq); /* ditto */
123 int allocateAnotherSeq(int nSeq); /* ditto */
124
is_fasta(FILE * f)125 static int is_fasta(FILE *f) {
126 // Check if we're looking at a FASTA file instead of a 2bit file.
127 // Assume FASTA will begin with '>' or newline + '>' or BOM + '>'
128 // If FASTA is detected, seek past the first '>'. Otherwise rewind.
129 char dat[5]; if(!fread(dat,4,1,f)) *dat=0;
130 rewind(f); dat[4]=0;
131 if (strspn(dat,"\r\n\xef\xbb\xbf>")) {
132 while(fgetc(f)!='>') if(feof(f)) {
133 fprintf(stderr,"FASTA file with no sequences??\n");
134 rewind(f); return 0;
135 }
136 return 1;
137 } else return 0;
138 }
139
readFastaSeqName(FILE * f)140 static void readFastaSeqName(FILE *f) {
141 // assume 'f' is positioned just after the '>'
142 if(!fgets(lastSequenceNameRead,sizeof(SeqName),f))
143 *lastSequenceNameRead = 0;
144 if(lastSequenceNameRead[strlen(lastSequenceNameRead)-1]!='\n') {
145 while (fgetc(f) != '\n') { if(feof(f)) break; }
146 }
147 lastSequenceNameRead[strcspn(lastSequenceNameRead," \t\r\n")]=0;
148 }
takeFastaSeqName(FILE * f,const char * buf)149 static void takeFastaSeqName(FILE *f,const char *buf) {
150 strcpy(lastSequenceNameRead,buf+1); /* ignore '>' */
151 lastSequenceNameRead[strcspn(lastSequenceNameRead," \t\r\n")]=0;
152 if(buf[strlen(buf)-1]!='\n') {
153 while (fgetc(f) != '\n') { if(feof(f)) break; }
154 }
155 }
156
fasta_genome(FILE * f,int ignoreVars)157 static SeqName* fasta_genome(FILE *f,int ignoreVars) {
158 fprintf(stderr,"Reading genome from FASTA file\n(slower than .2bit; may take time)\n");
159 int seqNo=0; char buf[80];
160 SeqName *seqNames=NULL;
161 while(!feof(f)) {
162 if(seqNo) takeFastaSeqName(f,buf);
163 else readFastaSeqName(f);
164 if(ignoreVars && is_variant()) {
165 while(fgets(buf,sizeof(buf),f) && *buf != '>');
166 continue;
167 }
168 fprintf(stderr,"%s\n",lastSequenceNameRead);
169 SeqName *lastSeqNames = seqNames;
170 seqNames = realloc(seqNames,(seqNo+1)*sizeof(SeqName));
171 if(!seqNames || allocateAnotherSeq(seqNo+1)==seqNo) {
172 free(lastSeqNames);
173 fprintf(stderr,"Genome metadata: Out of memory!\n"); break;
174 }
175 wrapped_memcpy(seqNames[seqNo],lastSequenceNameRead,sizeof(SeqName));
176 bit64 curBuf=0,curValid=0; b32 baseNo=0;
177 while(fgets(buf,sizeof(buf),f) && *buf != '>') {
178 char *b;
179 for(b=buf; *b; b++) {
180 switch (*b) {
181 case 'N': case 'n': curBuf = curValid = 0; break;
182 default:
183 addFastaBase(&curBuf,&curValid,*b);
184 look(curBuf,curValid,seqNo,baseNo);
185 } baseNo++;
186 }
187 }
188 seqNo++;
189 }
190 fprintf(stderr,"End of FASTA genome scan\n"); return seqNames;
191 }
192
go_through_genome(FILE * f,int ignoreVars)193 SeqName* go_through_genome(FILE *f,int ignoreVars) {
194 variants_skipped = 0;
195 if(is_fasta(f)) return fasta_genome(f,ignoreVars);
196 int byteSwap=0; long seqPtr=0; // =0 to suppress warning
197 int nSeq=read_2bit_nSeqs(f,&byteSwap,&seqPtr);
198 if(!allocateSeqs(nSeq)) return NULL;
199 int seqsDone = 0;
200 SeqName *seqNames=NULL; int numSeqNames=0;
201 time_t start = time(NULL);
202 time_t nextDisplay = start + 1;
203 if(omp_get_max_threads() > 1) {
204 fprintf(stderr,"Parallelising scan (up to %d chromosomes simultaneously)\n",omp_get_max_threads());
205 }
206 int seqNo; char progressBuf[80]={0}; /* TODO: different screen widths? (low priority because it could just scroll, but 8 cores on a narrow terminal could be messy) */ enum { ProgWidthPerThread = 13 /* sequence name width is this minus 5; try to divide into the screen width and also allow for the possiblity of narrower terminals */ };
207 #if PARALLELIZE_CHROMOSOMES && defined(_OPENMP)
208 #pragma omp parallel for schedule(dynamic)
209 #endif
210 for(seqNo=0;seqNo<nSeq;seqNo++) {
211 time_t nextThreadUpdate = time(NULL)+1; int tNum = omp_get_thread_num(); char *pgBuf = ((tNum+1)*ProgWidthPerThread > sizeof(progressBuf)) ? NULL : (progressBuf+tNum*ProgWidthPerThread); int isRHS=(tNum==omp_get_num_threads()-1) || ((tNum+2)*ProgWidthPerThread)>sizeof(progressBuf);
212 b32 *allUnknownStart=0,*allUnknownLen=0,nBases=0; /* =0 for old compilers (don't warn) */
213 int isVariant,renumberedSeqNo=0; /* =0 for old compilers (don't warn) */
214 #if PARALLELIZE_CHROMOSOMES && defined(_OPENMP)
215 long ft=0; /* =0 for old compilers (don't warn) */
216 #endif
217 b32 baseNo = 0, *unknownStart=0, *unknownLen=0; /* last two =0 for old compilers */
218 int bufBytes=0;/* =0 for old compilers (don't warn) */
219 char *buf=NULL;
220 if(numSeqNames && !seqNames) continue; /* would have been 'break' below w/out OpenMP */
221 #if PARALLELIZE_CHROMOSOMES && defined(_OPENMP)
222 #pragma omp critical
223 #endif
224 {
225 if(read_2bit_nBases(f,byteSwap,&seqPtr,&allUnknownStart,&allUnknownLen,&nBases,&isVariant,ignoreVars)) {
226 if(isVariant) { free(allUnknownStart); free(allUnknownLen); } else {
227 if((renumberedSeqNo = seqsDone++) >= numSeqNames) {
228 numSeqNames = (numSeqNames+1)<<1;
229 seqNames = realloc(seqNames,numSeqNames*sizeof(SeqName)); memFail(seqNames,allUnknownLen,allUnknownStart,_memFail);
230 } if(seqNames) wrapped_memcpy(seqNames[renumberedSeqNo],lastSequenceNameRead,sizeof(SeqName));
231 unknownStart=allUnknownStart;unknownLen=allUnknownLen;
232 bufBytes = nBases/4;if(bufBytes) buf=malloc(bufBytes);
233 if(buf) {
234 if(!fread(buf,1,bufBytes,f)) {
235 fprintf(stderr,"\nError reading %d bytes (current pointer is &%lx)\nCorrupt genome file?\n",bufBytes,ftell(f));
236 free(buf); if(seqNames) free(seqNames); seqNames=NULL; /* would be 'return NULL' w/out OpenMP */
237 }
238 } else bufBytes=0;
239 #if PARALLELIZE_CHROMOSOMES && defined(_OPENMP)
240 if(f) ft = ftell(f);
241 #endif
242 } } else isVariant=1; }
243 if(isVariant || !seqNames) continue;
244 int bufPtr = 0;
245 bit64 curBuf=0,curValid=0; int basesLeftInByte = 0;
246 unsigned char byte = 0; // =0 to suppress warning
247 while(baseNo<nBases) {
248 /* OK, let's go through all the bases. And try not
249 to think of the daft 2001-ish "Internet meme".
250 TODO: it might be possible to manually unroll
251 this loop a bit so fewer tests are needed on
252 baseNo etc. */
253 if(!basesLeftInByte) {
254 if(bufPtr==bufBytes)
255 #if PARALLELIZE_CHROMOSOMES && defined(_OPENMP)
256 #pragma omp critical
257 #endif
258 {
259 #if PARALLELIZE_CHROMOSOMES && defined(_OPENMP)
260 fseek(f,ft++,SEEK_SET);
261 #endif
262 byte = getc(f);
263 } else byte=buf[bufPtr++];
264 basesLeftInByte = 4;
265 if(pgBuf && time(NULL) >= nextThreadUpdate)
266 #if PARALLELIZE_CHROMOSOMES && defined(_OPENMP)
267 #pragma omp critical
268 #endif
269 {
270 int outputted=sprintf(pgBuf,"%*s %2d%%",ProgWidthPerThread-5,seqNames[renumberedSeqNo],(int)((float)baseNo*100.0/(float)nBases));
271 if(!isRHS) pgBuf[outputted]=' ';
272 nextThreadUpdate = time(NULL)+1;
273 }
274 if(time(NULL) >= nextDisplay)
275 #if PARALLELIZE_CHROMOSOMES && defined(_OPENMP)
276 #pragma omp critical
277 #endif
278 // (this comment added to work around an auto-indent bug in some Emacs versions)
279 {
280 if(time(NULL) >= nextDisplay) {
281 if(omp_get_num_threads()==1) fputs("\rScanning ",stderr);
282 else fputs("\r",stderr); /* Scanning message will have been printed above, and we have many columns to worry about */
283 fputs(progressBuf,stderr); fflush(stderr);
284 nextDisplay = time(NULL) + 2;
285 }}
286 }
287 if(baseNo < *unknownStart) {
288 addBase(&curBuf,&curValid,byte,&basesLeftInByte);
289 look(curBuf,curValid,renumberedSeqNo,++baseNo);
290 } else { /* we're in an 'unknown' region */
291 --basesLeftInByte; /* ignore this one */
292 if(++baseNo == *unknownStart + *unknownLen) {
293 unknownStart++; unknownLen++;
294 curBuf = curValid = 0;
295 }
296 }
297 }
298 if(buf) free(buf);
299 free(allUnknownStart); free(allUnknownLen);
300 if(pgBuf) memset(pgBuf,' ',ProgWidthPerThread-isRHS);
301 }
302 fprintf(stderr,"\rGenome scan complete");
303 prnSeconds((long)(time(NULL)-start));
304 fputs(clearEOL(),stderr);
305 fputs("\n",stderr);
306 return seqNames;
307 }
308
output_genome_segment(FILE * f,int targetRenumberedSeqNo,b32 baseStart,int nBases,FILE * out,int ignoreVars)309 void output_genome_segment(FILE *f,int targetRenumberedSeqNo,b32 baseStart,int nBases,FILE *out,int ignoreVars) {
310 /* Cut-down version of go_through_genome for use in reports */
311 int byteSwap=0; long seqPtr=0;
312 rewind(f);
313 int nSeq=read_2bit_nSeqs(f,&byteSwap,&seqPtr);
314 int seqsDone = 0, seqNo;
315 for(seqNo=0;seqNo<nSeq;seqNo++) {
316 int isVariant;
317 b32 *allUnknownStart=0,*allUnknownLen=0, nb0;
318 if(read_2bit_nBases(f,byteSwap,&seqPtr,&allUnknownStart,&allUnknownLen,&nb0,&isVariant,ignoreVars)) {
319 free(allUnknownStart); free(allUnknownLen);
320 if (!isVariant && (seqsDone++ == targetRenumberedSeqNo)) {
321 fseek(f,--baseStart/4,SEEK_CUR); // 1st is 0 not 1
322 int shift = 2*(baseStart & 3);
323 int mask = (128|64) >> shift; shift = 6 - shift;
324 do {
325 unsigned char byte = getc(f);
326 while(mask && nBases) {
327 fputc("TCAG"[(byte >> shift) & 3], out);
328 nBases--; shift-=2; mask>>=2;
329 }
330 mask = 128|64; shift = 6;
331 } while(nBases);
332 break;
333 }}
334 }
335 }
336