1 /* $Id: llgetaa.c 625 2011-03-23 17:21:38Z wrp $ */
2 
3 /* copyright (c) 1996, 1997, 1998, 1999, 2007 by William R. Pearson and
4    The Rector & Visitors of the University of Virginia */
5 
6 /* Licensed under the Apache License, Version 2.0 (the "License");
7    you may not use this file except in compliance with the License.
8    You may obtain a copy of the License at
9 
10    http://www.apache.org/licenses/LICENSE-2.0
11 
12    Unless required by applicable law or agreed to in writing,
13    software distributed under this License is distributed on an "AS
14    IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
15    express or implied.  See the License for the specific language
16    governing permissions and limitations under the License.
17 */
18 
19 /*
20    Feb, 1998 - version for prss
21 
22    March, 2001 - modifications to support comp_thr.c: use libpos to indicate
23    whether the score is shuffled==1 or unshuffled==0.  This simplifies
24    complib.c and makes comp_thr.c possible
25 
26    modified version of nxgetaa.c that generates random sequences
27    for a library
28 */
29 
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 
34 #include "defs.h"
35 #include "mm_file.h"
36 
37 #include "uascii.h"
38 #include "structs.h"
39 
40 #define XTERNAL
41 #include "upam.h"
42 #undef XTERNAL
43 
/* ---- general constants ---- */
#define YES 1
#define NO 0
#define MAXLINE 512

#ifndef min
#define min(x,y) ((x) > (y) ? (y) : (x))
#endif

/* ---- sequence file formats recognized by getseq() ---- */
#define NO_FORMAT 0
#define FASTA_FORMAT 1
#define GCG_FORMAT 2

/* format chosen by getseq(); static, so the value persists between calls */
static int seq_format=NO_FORMAT;

/* text of the most recent '>' header line parsed by getseq() */
static char seq_title[200];

/* incremented by getseq() every time it is asked to read from stdin */
static int use_stdin=0;

/* description taken from the '>' line parsed by getseq() */
static char llibstr0[256];

/* copy of o_line made by getseq() once stdin has been read more than once */
static char llibstr1[256];

/* header line that ended the previous getseq() read */
static char o_line[256];

/* random number and shuffling routines, defined in another file */
extern int irand(int);
extern void shuffle(unsigned char *from, unsigned char *to, int n);
extern void wshuffle(unsigned char *from, unsigned char *to, int n, int wsiz, int *ieven);
66 
67 int
getseq(char * filen,int * qascii,unsigned char * seq,int maxs,char * libstr,int n_libstr,long * sq0off)68 getseq(char *filen, int *qascii,
69        unsigned char *seq, int maxs, char *libstr,
70        int n_libstr, long *sq0off)
71 {
72   FILE *fptr;
73   char line[512],*bp;
74   int i, j, n;
75   int ic;
76   int sstart, sstop, sset=0;
77   int have_desc = 0;
78   int desc_complete = 0;
79   int llen, l_offset;
80 
81   seq_title[0]='\0';
82 
83   sstart = sstop = -1;
84 #ifndef DOS
85   if ((bp=strchr(filen,':'))!=NULL) {
86 #else
87   if ((bp=strchr(filen+3,':'))!=NULL) {
88 #endif
89     *bp='\0';
90     if (*(bp+1)=='-') sscanf(bp+2,"%d",&sstop);
91     else sscanf(bp+1,"%d-%d",&sstart,&sstop);
92     sset=1;
93   }
94 
95   if (strcmp(filen,"-") && strcmp(filen,"@")) {
96     if ((fptr=fopen(filen,"r"))==NULL) {
97       fprintf(stderr," could not open %s\n",filen);
98       return 0;
99     }
100   }
101   else {
102     fptr = stdin;
103     use_stdin++;
104   }
105 
106   if (use_stdin > 1) {
107     have_desc = 1;
108     if ((bp=strchr(o_line,'\001'))!=NULL) *bp='\0';
109     strncpy(llibstr1,o_line,sizeof(llibstr1));
110     strncpy(libstr,o_line,n_libstr);
111     libstr[n_libstr-1]='\0';
112     l_offset = 0;
113   }
114 
115   if (sset==1) {
116     filen[strlen(filen)]=':';
117     if (*sq0off==1 || sstart>1) *sq0off = sstart;
118   }
119 
120   desc_complete = 0;
121   n=0;
122   while(fgets(line,sizeof(line),fptr)!=NULL) {
123     if (line[0]=='>') {
124       if (have_desc) {
125 	strncpy(o_line,line,sizeof(o_line));
126 	goto last;
127       }
128       l_offset = 0;
129       seq_format = FASTA_FORMAT;
130 
131       if ((bp=(char *)strchr(line,'\n'))!=NULL) {
132 	*bp='\0';				/* have newline */
133 	desc_complete = 1;
134       }
135 
136       if ((bp=strchr(line+1,'\001'))!=NULL) *bp='\0';
137       strncpy(seq_title,line+1,sizeof(seq_title));
138       strncpy(llibstr0,line+1,sizeof(llibstr0));
139       if (n_libstr <= 20) {
140 	if ((bp=(char *)strchr(line,' '))!=NULL) *bp='\0';
141       }
142       strncpy(libstr,line+1,n_libstr);
143       libstr[n_libstr-1]='\0';
144 
145       if (!desc_complete) {
146 	while (fgets(line, sizeof(line), fptr) != NULL) {
147 	  if (strchr(line,'\n') != NULL) {
148 	    line[0]='>';
149 	    break;
150 	  }
151 	}
152 	desc_complete = 1;
153       }
154     }
155     else if (seq_format==NO_FORMAT) {
156       seq_format = GCG_FORMAT;
157       qascii['*'] = qascii['X'];
158       l_offset = 10;
159       llen = strlen(line);
160       while (strncmp(&line[llen-3],"..\n",(size_t)3) != 0) {
161 	if (fgets(line,sizeof(line),fptr)==NULL) return 0;
162 	llen = strlen(line);
163       }
164       if (n_libstr <= 20) {
165 	if ((bp=(char *)strchr(line,' '))!=NULL) *bp='\0';
166 	else if ((bp=(char *)strchr(line,'\n'))!=NULL) *bp='\0';
167       }
168       strncpy(libstr,line,n_libstr);
169       libstr[n_libstr-1]='\0';
170       if (fgets(line,sizeof(line),fptr)==NULL) return 0;
171     }
172 
173     if (seq_format==GCG_FORMAT && strlen(line)<l_offset) continue;
174 
175     if (line[0]!='>'&& line[0]!=';') {
176       for (i=l_offset; (n<maxs)&&
177 	     ((ic=qascii[line[i]&AAMASK])<EL); i++)
178 	if (ic<NA) seq[n++]= ic;
179       if (ic == ES) break;
180     }
181     else {
182       if (have_desc) {
183 	strncpy(o_line,line,sizeof(o_line));
184 	goto last;
185       }
186       else {
187 	have_desc = 1;
188       }
189     }
190   }
191 
192  last:
193   if (n==maxs) {
194     fprintf(stderr," sequence may be truncated %d %d\n",n,maxs);
195     fflush(stderr);
196   }
197   if ((bp=strchr(libstr,'\n'))!=NULL) *bp = '\0';
198   if ((bp=strchr(libstr,'\r'))!=NULL) *bp = '\0';
199   seq[n]= EOSEQ;
200 
201   if (fptr!=stdin) fclose(fptr);
202 
203   if (sset) {
204     if (sstart <= 0) sstart = 1;
205     if (sstop <= 0) sstop = n;
206     sstart--;
207     sstop--;
208     for (i=0, j=sstart; j<=sstop; i++,j++)
209       seq[i] = seq[j];
210     n = sstop - sstart +1;
211     seq[n]=EOSEQ;
212   }
213 
214   return n;
215 }
216 
217 int
218 gettitle(filen,title,len)
219   char *filen, *title; int len;
220 {
221   FILE *fptr;
222   char line[512];
223   char *bp;
224   int ll,sset;
225 #ifdef WIN32
226   char *strpbrk();
227 #endif
228   sset = 0;
229 
230   if (use_stdin) {
231     if (use_stdin == 1) {
232       /*      use_stdin++; */
233       strncpy(title,llibstr0,len);
234     }
235     else {
236       strncpy(title,llibstr1,len);
237     }
238     if ((bp=strchr(title,'\001'))!=NULL) *bp='\0';
239     return strlen(title);
240   }
241 
242   if ((bp=strchr(filen,':'))!=NULL) { *bp='\0'; sset=1;}
243 
244   if ((fptr=fopen(filen,"r"))==NULL) {
245     fprintf(stderr," file %s was not found\n",filen);
246     fflush(stderr);
247     return 0;
248   }
249 
250   if (sset==1) filen[strlen(filen)]=':';
251 
252   while(fgets(line,sizeof(line),fptr)!=0) {
253     if (line[0]=='>'|| line[0]==';') goto found;
254   }
255   fclose(fptr);
256   title[0]='\0';
257   return 0;
258 
259  found:
260   if ((bp=strchr(line,'\001'))!=NULL) *bp = 0;
261 #ifdef WIN32
262   bp = strpbrk(line,"\n\r");
263 #else
264   bp = strchr(line,'\n');
265 #endif
266   if (bp!=NULL) *bp = 0;
267   strncpy(title,line,len);
268   title[len-1]='\0';
269   fclose(fptr);
270   return strlen(title);
271 }
272 
273 FILE *libf=NULL;
274 
275 long lpos;
276 char lline[MAXLINE];
277 int lfflag=0;	/* flag for CRLF in EMBL CDROM files */
278 #define LFCHAR '\015'  /* for MWC 5.5 */
279 
280 int agetlib(); void aranlib();	/* pearson fasta format */
281 
282 /*	the following is from fgetgb.c */
283 
284 /* a file name for open_lib may now include a library type suffix */
285 /* only opens fasta format files */
286 
287 static char libn_save[MAX_FN];
288 static int ldna_save=0;
289 static int do_shuffle;
290 static int shuff_cnt=10;
291 static int w_flag = 0;
292 #ifdef DEBUG
293 static FILE *dfile=NULL;
294 #endif
295 static unsigned char *aa_save;
296 static int n1_save;
297 static int i_even;
298 
299 /* lmf_str * is used here for compatibility with the "normal" open_lib,
300    but is largely unnecessary */
301 
302 void
303 set_shuffle(struct mngmsg m_msg) {
304   char dfname[MAX_FN];
305 
306   if (m_msg.shuff_wid > 0) w_flag = m_msg.shuff_wid;
307   if (m_msg.shuff_max > shuff_cnt) shuff_cnt = m_msg.shuff_max;
308 
309 #ifdef DEBUG
310   if (m_msg.dfile[0]!='\0') {
311     strncpy(dfname,m_msg.dfile,sizeof(dfname));
312     strncat(dfname,"_rlib",sizeof(dfname));
313     dfile = fopen(dfname,"w");
314   }
315 #endif
316 }
317 
318 struct lmf_str *
319 open_lib(char *lname, int ldnaseq, int *sascii, int quiet, struct lmf_str *m_fd)
320 {
321   char rline[10],libn[MAX_FN], *bp;
322   int wcnt, ll, opnflg;
323   int libtype;
324   struct lmf_str *m_fptr;
325 
326   wcnt = 0;
327   libtype = 0;
328 
329   strncpy(libn_save,lname,sizeof(libn_save));
330 
331   /* now allocate a buffer for the opened text file */
332   if ((m_fptr = calloc(1,sizeof(struct lmf_str)))==NULL) {
333     fprintf(stderr," cannot allocate lmf_str (%ld) for %s\n",
334 	    sizeof(struct lmf_str),lname);
335     return NULL;
336   }
337 
338   strncpy(m_fptr->lb_name,lname,MAX_FN);
339   m_fptr->lb_name[MAX_FN-1]='\0';
340 
341   m_fptr->sascii = sascii;
342   m_fptr->getlib = agetlib;
343   m_fptr->ranlib = aranlib;
344   m_fptr->mm_flg = 0;
345 
346   do_shuffle = 0;
347   irand(0);		/* initialize the random number generator */
348 
349   return m_fptr;
350 }
351 
352 void
353 closelib()
354 {
355   if (libf!=NULL) {
356     fclose(libf);
357     libf = NULL;
358   }
359 #ifdef DEBUG
360   if (dfile) fclose(dfile);
361 #endif
362 }
363 
364 static int ieven=0;
365 static char *desc_save;
366 
367 int
368 agetlib(unsigned char *seq,
369 	int maxs,
370 	char *libstr,
371 	int n_libstr,
372 	fseek_t *libpos,
373 	int *lcont,
374 	struct lmf_str *lf_fd,
375 	long *l_off)
376 {
377   long sq1_off;
378   char lib_desc[120];
379   int i;
380 
381   *l_off = 1;
382 
383   if (!do_shuffle) {
384     do_shuffle = 1;
385 
386     if ((n1_save = getseq(libn_save,lf_fd->sascii,
387 			  seq,maxs,lib_desc,sizeof(lib_desc),&sq1_off)) < 1)
388       return n1_save;
389 
390     strncpy(libstr,lib_desc,n_libstr);
391     libstr[n_libstr-1]='\0';
392 
393     if ((aa_save = (unsigned char *)calloc(n1_save+1,sizeof(unsigned char)))==
394 	NULL) fprintf(stderr," cannot allocate %d for saved sequence\n",
395 		       n1_save);
396     memcpy((void *)aa_save,(void *)seq,n1_save);
397 
398     if ((desc_save =
399 	 (char *)calloc(strlen(lib_desc)+1,sizeof(char)))== NULL) {
400       fprintf(stderr," cannot allocate saved desciption [%d]\n",
401 	      strlen(lib_desc)+1);
402     }
403     else {
404       strncpy (desc_save,lib_desc,strlen(lib_desc));
405       desc_save[strlen(lib_desc)]=='\0';
406     }
407 
408     *libpos = 0;
409     return n1_save;
410   }
411   else {	/* return a shuffled sequence - here we need a window size; */
412     strncpy(libstr,desc_save,n_libstr);
413     libstr[n_libstr-1]='\0';
414 
415     if (shuff_cnt-- <= 0 ) return -1;
416     if (w_flag > 0) wshuffle(aa_save,seq,n1_save,w_flag,&ieven);
417     else shuffle(aa_save,seq,n1_save);
418     seq[n1_save] = EOSEQ;
419 #ifdef DEBUG
420     if (dfile!=NULL) {
421       fprintf(dfile,">%d\n",shuff_cnt);
422       for (i=0; i<n1_save; i++) {
423 	if (aa[seq[i]]>0) fputc(aa[seq[i]],dfile);
424 	else {fprintf(stderr,"error aa0[%d]: %d %d\n",
425 		      i,seq[i],aa[seq[i]]);}
426 	if (i%60 == 59) fputc('\n',dfile);
427       }
428       fputc('\n',dfile);
429     }
430 #endif
431     *libpos = 1;
432     return n1_save;
433   }
434 }
435 
436 void
437 aranlib(char *str,
438 	int cnt,
439 	fseek_t seek,
440 	char *libstr,
441 	struct lmf_str *lm_fd)
442 {
443   char *bp;
444   int ll;
445 
446   if (use_stdin == 2) {
447     if (llibstr1[0]=='>' || llibstr1[0]==';') {
448       strncpy(str,llibstr1+1,cnt);
449     }
450     else {
451       strncpy(str,llibstr1,cnt);
452     }
453   }
454   else {
455     strncpy(str,desc_save,cnt);
456   }
457   str[cnt-1]='\0';
458   if ((bp = strchr(str,'\001'))!=NULL) *bp='\0';
459   else if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
460   else str[cnt-1]='\0';
461 }
462 
463 /*
464 void
465 revcomp(unsigned char *seq, int n, int *c_nt)
466 {
467   unsigned char tmp;
468   int i, ni;
469 
470 
471   for (i=0, ni = n-1; i< n/2; i++,ni--) {
472     tmp = c_nt[seq[i]];
473     seq[i] = c_nt[seq[ni]];
474     seq[ni] = tmp;
475   }
476   if ((n%2)==1) {
477     i = n/2;
478     seq[i] = c_nt[seq[i]];
479   }
480 }
481 */
482 
/*
 * re_openlib() - nothing has to be re-opened for this library type, so
 * the descriptor that was passed in is handed straight back.  outtty
 * is not used.
 */
struct lmf_str *
re_openlib(struct lmf_str *om_fptr, int outtty)
{
  struct lmf_str *same_lib = om_fptr;

  (void)outtty;
  return same_lib;
}
488 
/*
 * re_getlib() - nothing is re-read for this library type: *loffset is
 * set to 0, *l_off is set to 1, and the length n1 supplied by the
 * caller is returned unchanged.  The remaining arguments are not used.
 */
int
re_getlib(unsigned char *aa1, int n1, int maxt3, int loff, int cont,
          int term_code, long *loffset, long *l_off,
          struct lmf_str *m_file_p)
{
  const int seq_len = n1;

  loffset[0] = 0;
  l_off[0] = 1;

  return seq_len;
}
497 
498