1 /* getseq.c */
2 
3 /* copyright (c) 1987,1988,1989,1992,1995,2000, 2014 by William R. Pearson
4    and The Rectors and Visitors of the University of Virginia */
5 
6 /* Licensed under the Apache License, Version 2.0 (the "License");
7    you may not use this file except in compliance with the License.
8    You may obtain a copy of the License at
9 
10    http://www.apache.org/licenses/LICENSE-2.0
11 
12    Unless required by applicable law or agreed to in writing,
13    software distributed under this License is distributed on an "AS
14    IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either
15    express or implied.  See the License for the specific language
16    governing permissions and limitations under the License.
17 */
18 
19 /*	May, June 1987	- modified for rapid read of database
20 	This is one of three alternative files that can be used to
21 	read a database.  The three files are nxgetaa.c, nmgetaa.c, and
22 	mmgetaa.c.
23 
24 	nxgetaa.c contains the original code for reading databases, and
25 	is still used for Mac and PC versions of fasta33 (which do not
26 	use mmap).
27 
28 	nmgetaa.c and mmgetaa.c are used together.  nmgetaa.c provides
29 	the same functions as nxgetaa.c if memory mapping is not used,
30 	mmgetaa.c provides the database reading functions if memory
31 	mapping is used. The decision to use memory mapping is made on
32 	a file-by-file basis.
33 
34 	June 2, 1987 - added TFASTA
35 	March 30, 1988 - combined ffgetaa, fgetgb;
36 	April 8, 1988 - added PIRLIB format for unix
37 	Feb 4, 1989 - added universal subroutines for libraries
38 	December, 1995 - added range option file.name:1-1000
39 	Feb 22, 2002 - fix to allow "plain" text file queries
40 
41 	getnt.c	associated subroutines for matching sequences */
42 
43 /*  $Id: getseq.c 625 2011-03-23 17:21:38Z wrp $ */
44 /* $Revision: 625 $  */
45 
46 /*
47 	8-April-88
48 	The compile time #define PIRLIB allows this routine to be used
49 	to read protein and DNA sequence libraries in the NBRF/PIR
50 	VAX/VMS library format.  That is:
51 
52 	>P1;LCBO
53 	This is a line of description
54 	GTYH ... the sequence starts on this line
55 
56 	This may ease conversion from UWGCG format libraries. It
57 	has not been extensively tested.
58 
59 	In addition, sequence libraries with a '>' in the 4th position
60 	are recognized as NBRF format libraries for consistency with
61 	UWGCG
62 */
63 
64 /* 	Nov 12, 1987	- this version checks to see if the sequence
65 	is DNA or protein by asking whether > 85% is A, C, G, T
66 
67 	May 5, 1988 - modify the DNA/PROTEIN checker by re-reading
68 	DNA sequences in order to check for 'U'.
69 */
70 
71 #include <stdio.h>
72 #include <stdlib.h>
73 #include <string.h>
74 
75 #include "defs.h"
76 #include "structs.h"
77 
78 #ifndef SFCHAR
79 #define SFCHAR ':'
80 #endif
81 
82 #define XTERNAL
83 #include "uascii.h"
84 #include "upam.h"
85 #undef XTERNAL
86 
87 #define YES 1
88 #define NO 0
89 #define MAXLINE 512
90 
91 #ifndef min
92 #define min(x,y) ((x) > (y) ? (y) : (x))
93 #endif
94 
95 #define NO_FORMAT 0
96 #define FASTA_FORMAT 1
97 #define GCG_FORMAT 2
98 
99 static int seq_format=NO_FORMAT;
100 static char seq_title[200];
101 
102 int scanseq(unsigned char *, int, char *);
103 void sf_sort(int *, int);
104 extern void init_ascii(int is_ext, int *sascii, int is_dna);
105 
106 /* getseq	- get a query sequence, possibly re-reading to set type
107    returns	- length of query sequence or error = 0
108 
109    char *filen	- name of file to be opened
110    char *seq	- destination for query sequence
111    int maxs	- maximum length of query
112    char libstr[20] - short description (locus or acc)
113    int *dnaseq  - -1 => use scanseq to determine sequence type
114    		   0 => must be protein
115 		   1 => must be DNA
116    long *sq0off	- offset into query specified by query_file:1001-2000
117 */
118 
119 int
getseq(char * filen,int * qascii,unsigned char * seq,int maxs,char * libstr,long * sq0off)120 getseq(char *filen, int *qascii, unsigned char *seq, int maxs, char *libstr, long *sq0off)
121 {
122   FILE *fptr;
123   char line[512],*bp, *bp1, *bpn, *tp;
124   int i, rn, n;
125   int ic;
126   int sstart, sstop, sset=0;
127   int llen, l_offset;
128 
129   seq_title[0]='\0';
130   libstr[0]='\0';
131 
132   sstart = sstop = -1;
133 #ifndef DOS
134   if ((bp=strchr(filen,':'))!=NULL && *(bp+1)!='\0') {
135 #else
136   if ((bp=strchr(filen+3,':'))!=NULL && *(bp+1)!='\0') {
137 #endif
138     *bp='\0';
139     if (*(bp+1)=='-') {
140       sstart = 0;
141       sscanf(bp+2,"%d",&sstop);
142     }
143     else {
144       sscanf(bp+1,"%d-%d",&sstart,&sstop);
145       sstart--;
146       if (sstop <= 0 ) sstop = BIGNUM;
147     }
148     sset=1;
149   }
150   else {
151     sstart = 0;
152     sstop = BIGNUM;
153   }
154 
155   /* check for input from stdin */
156   if (strcmp(filen,"-") && strcmp(filen,"@")) {
157     if ((fptr=fopen(filen,"r"))==NULL) {
158       fprintf(stderr," could not open %s\n",filen);
159       return 0;
160     }
161   }
162   else {
163     fptr = stdin;
164   }
165   rn = n=0;
166 
167   while(fgets(line,sizeof(line),fptr)!=NULL) {
168     l_offset = 0;
169     if (line[0]=='>') {
170       seq_format = FASTA_FORMAT;
171       if ((bp=(char *)strchr(line,'\n'))!=NULL) *bp='\0';
172       strncpy(seq_title,line+1,sizeof(seq_title));
173       seq_title[sizeof(seq_title)-1]='\0';
174       if ((bp=(char *)strchr(line,' '))!=NULL) *bp='\0';
175       strncpy(libstr,line+1,12);
176       libstr[12]='\0';
177     }
178     else if (seq_format==NO_FORMAT && strcmp(line,"..")==0) {
179       seq_format = GCG_FORMAT;
180 /*
181       if (*dnaseq != 1) qascii['*'] = qascii['X'];
182 */
183       l_offset = 10;
184       llen = strlen(line);
185       while (strncmp(&line[llen-3],"..\n",(size_t)3) != 0) {
186 	if (fgets(line,sizeof(line),fptr)==NULL) return 0;
187 	llen = strlen(line);
188       }
189       bp = strtok(line," \t");
190 /*
191       if ((bp=(char *)strchr(line,' '))!=NULL) *bp='\0';
192       else if ((bp=(char *)strchr(line,'\n'))!=NULL) *bp='\0';
193 */
194       if (bp!=NULL) strncpy(libstr,bp,12);
195       else strncpy(libstr,filen,12);
196       libstr[12]='\0';
197       if (fgets(line,sizeof(line),fptr)==NULL) return 0;
198     }
199     else {
200       if (libstr[0]=='\0') strncpy(libstr,filen,12);
201       libstr[12]='\0';
202     }
203 
204     if (seq_format==GCG_FORMAT && strlen(line)<l_offset) continue;
205 
206     if (line[0]!='>'&& line[0]!=';') {
207       for (i=l_offset; (n<maxs && rn < sstop)&&
208 	     ((ic=qascii[line[i]&AAMASK])<EL); i++)
209 	if (ic<NA && ++rn > sstart) seq[n++]= ic;
210       if (ic == ES || rn > sstop) break;
211     }
212   }
213 
214   if (n==maxs) {
215     fprintf(stderr," sequence may be truncated %d %d\n",n,maxs);
216     fflush(stderr);
217   }
218   if ((bp=strchr(libstr,'\n'))!=NULL) *bp = '\0';
219   if ((bp=strchr(libstr,'\r'))!=NULL) *bp = '\0';
220   seq[n]= EOSEQ;
221 
222 
223   if (seq_format !=GCG_FORMAT)
224     while(fgets(line,sizeof(line),fptr)!=NULL) {
225 	if (line[0]!='>'&& line[0]!=';') {
226 	  for (i=0; (n<maxs && rn < sstop)&&
227 		 ((ic=qascii[line[i]&AAMASK])<EL); i++)
228 	    if (ic<NA && ++rn > sstart ) seq[n++]= ic;
229 	  if (ic == ES || rn > sstop) break;
230 	}
231     }
232   else {
233     llen = strlen(line);
234     while (strncmp(&line[llen-3],"..\n",(size_t)3) != 0) {
235       if (fgets(line,sizeof(line),fptr)==NULL) return 0;
236       llen = strlen(line);
237     }
238     while (fgets(line,sizeof(line),fptr)!=NULL) {
239       if (strlen(line)<l_offset) continue;
240       for (i=l_offset; (n<maxs && rn < sstop) &&
241 	     ((ic=qascii[line[i]&AAMASK])<EL); i++)
242 	if (ic<NA && ++rn > sstart ) seq[n++]= ic;
243       if (ic == ES || rn > sstop ) break;
244     }
245   }
246 
247   if (n==maxs) {
248     fprintf(stderr," sequence may be truncated %d %d\n",n,maxs);
249     fflush(stderr);
250   }
251   seq[n]= EOSEQ;
252 
253   if (fptr!=stdin) fclose(fptr);
254 
255   if (sset==1) {
256     sstart++;
257     filen[strlen(filen)]=':';
258     if (*sq0off==1 || sstart>=1) *sq0off = sstart;
259   }
260 
261   return n;
262 }
263 
264 int
265 gettitle(char *filen, char *title, int len) {
266   FILE *fptr;
267   char line[512];
268   char *bp;
269   int sset;
270 #ifdef WIN32
271   char *strpbrk();
272 #endif
273 
274   sset = 0;
275 
276   if (strncmp(filen,"-",1)==0 || strncmp(filen,"@",1)==0) {
277     strncpy(title,seq_title,len);
278     title[len-1]='\0';
279     return (int)strlen(title);
280   }
281 
282   if ((bp=strchr(filen,':'))!=NULL) { *bp='\0'; sset=1;}
283 
284 
285   if ((fptr=fopen(filen,"r"))==NULL) {
286     fprintf(stderr," file %s was not found\n",filen);
287     fflush(stderr);
288     return 0;
289   }
290 
291   if (sset==1) filen[strlen(filen)]=':';
292 
293   while(fgets(line,sizeof(line),fptr)!=NULL) {
294     if (line[0]=='>'|| line[0]==';') goto found;
295   }
296   fclose(fptr);
297   title[0]='\0';
298   return 0;
299 
300  found:
301 
302 #ifdef WIN32
303   bp = strpbrk(line,"\n\r");
304 #else
305   bp = strchr(line,'\n');
306 #endif
307   if (bp!=NULL) *bp = 0;
308   strncpy(title,line,len);
309   title[len-1]='\0';
310   fclose(fptr);
311   return strlen(title);
312 }
313 
314