1 /* getseq.c */
2
3 /* copyright (c) 1987,1988,1989,1992,1995,2000, 2014 by William R. Pearson
4 and The Rectors and Visitors of the University of Virginia */
5
6 /* Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing,
13 software distributed under this License is distributed on an "AS
14 IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either
15 express or implied. See the License for the specific language
16 governing permissions and limitations under the License.
17 */
18
19 /* May, June 1987 - modified for rapid read of database
20 This is one of three alternative files that can be used to
21 read a database. The three files are nxgetaa.c, nmgetaa.c, and
22 mmgetaa.c.
23
24 nxgetaa.c contains the original code for reading databases, and
25 is still used for Mac and PC versions of fasta33 (which do not
26 use mmap).
27
28 nmgetaa.c and mmgetaa.c are used together. nmgetaa.c provides
29 the same functions as nxgetaa.c if memory mapping is not used,
30 mmgetaa.c provides the database reading functions if memory
31 mapping is used. The decision to use memory mapping is made on
32 a file-by-file basis.
33
34 June 2, 1987 - added TFASTA
35 March 30, 1988 - combined ffgetaa, fgetgb;
36 April 8, 1988 - added PIRLIB format for unix
37 Feb 4, 1989 - added universal subroutines for libraries
38 December, 1995 - added range option file.name:1-1000
39 Feb 22, 2002 - fix to allow "plain" text file queries
40
41 getnt.c associated subroutines for matching sequences */
42
43 /* $Id: getseq.c 625 2011-03-23 17:21:38Z wrp $ */
44 /* $Revision: 625 $ */
45
46 /*
47 8-April-88
48 The compile time #define PIRLIB allows this routine to be used
49 to read protein and DNA sequence libraries in the NBRF/PIR
50 VAX/VMS library format. That is:
51
52 >P1;LCBO
53 This is a line of description
54 GTYH ... the sequence starts on this line
55
56 This may ease conversion from UWGCG format libraries. It
57 has not been extensively tested.
58
59 In addition, sequence libraries with a '>' in the 4th position
60 are recognized as NBRF format libraries for consistency with
61 UWGCG
62 */
63
64 /* Nov 12, 1987 - this version checks to see if the sequence
65 is DNA or protein by asking whether > 85% is A, C, G, T
66
67 May 5, 1988 - modify the DNA/PROTEIN checker by re-reading
68 DNA sequences in order to check for 'U'.
69 */
70
71 #include <stdio.h>
72 #include <stdlib.h>
73 #include <string.h>
74
75 #include "defs.h"
76 #include "structs.h"
77
78 #ifndef SFCHAR
79 #define SFCHAR ':'
80 #endif
81
82 #define XTERNAL
83 #include "uascii.h"
84 #include "upam.h"
85 #undef XTERNAL
86
87 #define YES 1
88 #define NO 0
89 #define MAXLINE 512
90
91 #ifndef min
92 #define min(x,y) ((x) > (y) ? (y) : (x))
93 #endif
94
95 #define NO_FORMAT 0
96 #define FASTA_FORMAT 1
97 #define GCG_FORMAT 2
98
99 static int seq_format=NO_FORMAT;
100 static char seq_title[200];
101
102 int scanseq(unsigned char *, int, char *);
103 void sf_sort(int *, int);
104 extern void init_ascii(int is_ext, int *sascii, int is_dna);
105
106 /* getseq - get a query sequence, possibly re-reading to set type
107 returns - length of query sequence or error = 0
108
109 char *filen - name of file to be opened
110 char *seq - destination for query sequence
111 int maxs - maximum length of query
112 char libstr[20] - short description (locus or acc)
113 int *dnaseq - -1 => use scanseq to determine sequence type
114 0 => must be protein
115 1 => must be DNA
116 long *sq0off - offset into query specified by query_file:1001-2000
117 */
118
119 int
getseq(char * filen,int * qascii,unsigned char * seq,int maxs,char * libstr,long * sq0off)120 getseq(char *filen, int *qascii, unsigned char *seq, int maxs, char *libstr, long *sq0off)
121 {
122 FILE *fptr;
123 char line[512],*bp, *bp1, *bpn, *tp;
124 int i, rn, n;
125 int ic;
126 int sstart, sstop, sset=0;
127 int llen, l_offset;
128
129 seq_title[0]='\0';
130 libstr[0]='\0';
131
132 sstart = sstop = -1;
133 #ifndef DOS
134 if ((bp=strchr(filen,':'))!=NULL && *(bp+1)!='\0') {
135 #else
136 if ((bp=strchr(filen+3,':'))!=NULL && *(bp+1)!='\0') {
137 #endif
138 *bp='\0';
139 if (*(bp+1)=='-') {
140 sstart = 0;
141 sscanf(bp+2,"%d",&sstop);
142 }
143 else {
144 sscanf(bp+1,"%d-%d",&sstart,&sstop);
145 sstart--;
146 if (sstop <= 0 ) sstop = BIGNUM;
147 }
148 sset=1;
149 }
150 else {
151 sstart = 0;
152 sstop = BIGNUM;
153 }
154
155 /* check for input from stdin */
156 if (strcmp(filen,"-") && strcmp(filen,"@")) {
157 if ((fptr=fopen(filen,"r"))==NULL) {
158 fprintf(stderr," could not open %s\n",filen);
159 return 0;
160 }
161 }
162 else {
163 fptr = stdin;
164 }
165 rn = n=0;
166
167 while(fgets(line,sizeof(line),fptr)!=NULL) {
168 l_offset = 0;
169 if (line[0]=='>') {
170 seq_format = FASTA_FORMAT;
171 if ((bp=(char *)strchr(line,'\n'))!=NULL) *bp='\0';
172 strncpy(seq_title,line+1,sizeof(seq_title));
173 seq_title[sizeof(seq_title)-1]='\0';
174 if ((bp=(char *)strchr(line,' '))!=NULL) *bp='\0';
175 strncpy(libstr,line+1,12);
176 libstr[12]='\0';
177 }
178 else if (seq_format==NO_FORMAT && strcmp(line,"..")==0) {
179 seq_format = GCG_FORMAT;
180 /*
181 if (*dnaseq != 1) qascii['*'] = qascii['X'];
182 */
183 l_offset = 10;
184 llen = strlen(line);
185 while (strncmp(&line[llen-3],"..\n",(size_t)3) != 0) {
186 if (fgets(line,sizeof(line),fptr)==NULL) return 0;
187 llen = strlen(line);
188 }
189 bp = strtok(line," \t");
190 /*
191 if ((bp=(char *)strchr(line,' '))!=NULL) *bp='\0';
192 else if ((bp=(char *)strchr(line,'\n'))!=NULL) *bp='\0';
193 */
194 if (bp!=NULL) strncpy(libstr,bp,12);
195 else strncpy(libstr,filen,12);
196 libstr[12]='\0';
197 if (fgets(line,sizeof(line),fptr)==NULL) return 0;
198 }
199 else {
200 if (libstr[0]=='\0') strncpy(libstr,filen,12);
201 libstr[12]='\0';
202 }
203
204 if (seq_format==GCG_FORMAT && strlen(line)<l_offset) continue;
205
206 if (line[0]!='>'&& line[0]!=';') {
207 for (i=l_offset; (n<maxs && rn < sstop)&&
208 ((ic=qascii[line[i]&AAMASK])<EL); i++)
209 if (ic<NA && ++rn > sstart) seq[n++]= ic;
210 if (ic == ES || rn > sstop) break;
211 }
212 }
213
214 if (n==maxs) {
215 fprintf(stderr," sequence may be truncated %d %d\n",n,maxs);
216 fflush(stderr);
217 }
218 if ((bp=strchr(libstr,'\n'))!=NULL) *bp = '\0';
219 if ((bp=strchr(libstr,'\r'))!=NULL) *bp = '\0';
220 seq[n]= EOSEQ;
221
222
223 if (seq_format !=GCG_FORMAT)
224 while(fgets(line,sizeof(line),fptr)!=NULL) {
225 if (line[0]!='>'&& line[0]!=';') {
226 for (i=0; (n<maxs && rn < sstop)&&
227 ((ic=qascii[line[i]&AAMASK])<EL); i++)
228 if (ic<NA && ++rn > sstart ) seq[n++]= ic;
229 if (ic == ES || rn > sstop) break;
230 }
231 }
232 else {
233 llen = strlen(line);
234 while (strncmp(&line[llen-3],"..\n",(size_t)3) != 0) {
235 if (fgets(line,sizeof(line),fptr)==NULL) return 0;
236 llen = strlen(line);
237 }
238 while (fgets(line,sizeof(line),fptr)!=NULL) {
239 if (strlen(line)<l_offset) continue;
240 for (i=l_offset; (n<maxs && rn < sstop) &&
241 ((ic=qascii[line[i]&AAMASK])<EL); i++)
242 if (ic<NA && ++rn > sstart ) seq[n++]= ic;
243 if (ic == ES || rn > sstop ) break;
244 }
245 }
246
247 if (n==maxs) {
248 fprintf(stderr," sequence may be truncated %d %d\n",n,maxs);
249 fflush(stderr);
250 }
251 seq[n]= EOSEQ;
252
253 if (fptr!=stdin) fclose(fptr);
254
255 if (sset==1) {
256 sstart++;
257 filen[strlen(filen)]=':';
258 if (*sq0off==1 || sstart>=1) *sq0off = sstart;
259 }
260
261 return n;
262 }
263
264 int
265 gettitle(char *filen, char *title, int len) {
266 FILE *fptr;
267 char line[512];
268 char *bp;
269 int sset;
270 #ifdef WIN32
271 char *strpbrk();
272 #endif
273
274 sset = 0;
275
276 if (strncmp(filen,"-",1)==0 || strncmp(filen,"@",1)==0) {
277 strncpy(title,seq_title,len);
278 title[len-1]='\0';
279 return (int)strlen(title);
280 }
281
282 if ((bp=strchr(filen,':'))!=NULL) { *bp='\0'; sset=1;}
283
284
285 if ((fptr=fopen(filen,"r"))==NULL) {
286 fprintf(stderr," file %s was not found\n",filen);
287 fflush(stderr);
288 return 0;
289 }
290
291 if (sset==1) filen[strlen(filen)]=':';
292
293 while(fgets(line,sizeof(line),fptr)!=NULL) {
294 if (line[0]=='>'|| line[0]==';') goto found;
295 }
296 fclose(fptr);
297 title[0]='\0';
298 return 0;
299
300 found:
301
302 #ifdef WIN32
303 bp = strpbrk(line,"\n\r");
304 #else
305 bp = strchr(line,'\n');
306 #endif
307 if (bp!=NULL) *bp = 0;
308 strncpy(title,line,len);
309 title[len-1]='\0';
310 fclose(fptr);
311 return strlen(title);
312 }
313
314