1 /* $Id: llgetaa.c 625 2011-03-23 17:21:38Z wrp $ */
2
3 /* copyright (c) 1996, 1997, 1998, 1999, 2007 by William R. Pearson and
4 The Rector & Visitors of the University of Virginia */
5
6 /* Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing,
13 software distributed under this License is distributed on an "AS
14 IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
15 express or implied. See the License for the specific language
16 governing permissions and limitations under the License.
17 */
18
19 /*
20 Feb, 1998 - version for prss
21
22 March, 2001 - modifications to support comp_thr.c: use libpos to indicate
23 whether the score is shuffled==1 or unshuffled==0. This simplifies
24 complib.c and makes comp_thr.c possible
25
26 modified version of nxgetaa.c that generates random sequences
27 for a library
28 */
29
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33
34 #include "defs.h"
35 #include "mm_file.h"
36
37 #include "uascii.h"
38 #include "structs.h"
39
40 #define XTERNAL
41 #include "upam.h"
42 #undef XTERNAL
43
44 #define YES 1
45 #define NO 0
46 #define MAXLINE 512 /* size of the lline[] buffer declared later in this file */
47
48 #ifndef min
49 #define min(x,y) ((x) > (y) ? (y) : (x)) /* smaller of x,y; the selected argument is evaluated twice */
50 #endif
51
52 static int use_stdin=0; /* incremented each time getseq() is asked to read "-" or "@" (stdin) */
53 static char llibstr0[256]; /* text after '>' on the FASTA header read by getseq() */
54 static char llibstr1[256]; /* copy of o_line made when getseq() runs with use_stdin > 1 */
55 static char o_line[256]; /* header line that ended the previous sequence; saved by getseq() for its next call */
56
57 #define NO_FORMAT 0 /* no input line has been classified yet */
58 #define FASTA_FORMAT 1 /* set by getseq() when a line starts with '>' */
59 #define GCG_FORMAT 2 /* set by getseq() when the first line seen is not a '>' line */
60 static int seq_format=NO_FORMAT; /* format chosen by getseq(); static, so it persists across calls */
61 static char seq_title[200]; /* text after '>' on the header; cleared at the start of every getseq() call */
62
63 extern int irand(int); /* called as irand(0) by open_lib() to initialize the random number generator */
64 extern void shuffle(unsigned char *from, unsigned char *to, int n); /* used by agetlib() when w_flag == 0 */
65 extern void wshuffle(unsigned char *from, unsigned char *to, int n, int wsiz, int *ieven); /* used by agetlib() when w_flag > 0 */
66
67 int
getseq(char * filen,int * qascii,unsigned char * seq,int maxs,char * libstr,int n_libstr,long * sq0off)68 getseq(char *filen, int *qascii,
69 unsigned char *seq, int maxs, char *libstr,
70 int n_libstr, long *sq0off)
71 {
72 FILE *fptr;
73 char line[512],*bp;
74 int i, j, n;
75 int ic;
76 int sstart, sstop, sset=0;
77 int have_desc = 0;
78 int desc_complete = 0;
79 int llen, l_offset;
80
81 seq_title[0]='\0';
82
83 sstart = sstop = -1;
84 #ifndef DOS
85 if ((bp=strchr(filen,':'))!=NULL) {
86 #else
87 if ((bp=strchr(filen+3,':'))!=NULL) {
88 #endif
89 *bp='\0';
90 if (*(bp+1)=='-') sscanf(bp+2,"%d",&sstop);
91 else sscanf(bp+1,"%d-%d",&sstart,&sstop);
92 sset=1;
93 }
94
95 if (strcmp(filen,"-") && strcmp(filen,"@")) {
96 if ((fptr=fopen(filen,"r"))==NULL) {
97 fprintf(stderr," could not open %s\n",filen);
98 return 0;
99 }
100 }
101 else {
102 fptr = stdin;
103 use_stdin++;
104 }
105
106 if (use_stdin > 1) {
107 have_desc = 1;
108 if ((bp=strchr(o_line,'\001'))!=NULL) *bp='\0';
109 strncpy(llibstr1,o_line,sizeof(llibstr1));
110 strncpy(libstr,o_line,n_libstr);
111 libstr[n_libstr-1]='\0';
112 l_offset = 0;
113 }
114
115 if (sset==1) {
116 filen[strlen(filen)]=':';
117 if (*sq0off==1 || sstart>1) *sq0off = sstart;
118 }
119
120 desc_complete = 0;
121 n=0;
122 while(fgets(line,sizeof(line),fptr)!=NULL) {
123 if (line[0]=='>') {
124 if (have_desc) {
125 strncpy(o_line,line,sizeof(o_line));
126 goto last;
127 }
128 l_offset = 0;
129 seq_format = FASTA_FORMAT;
130
131 if ((bp=(char *)strchr(line,'\n'))!=NULL) {
132 *bp='\0'; /* have newline */
133 desc_complete = 1;
134 }
135
136 if ((bp=strchr(line+1,'\001'))!=NULL) *bp='\0';
137 strncpy(seq_title,line+1,sizeof(seq_title));
138 strncpy(llibstr0,line+1,sizeof(llibstr0));
139 if (n_libstr <= 20) {
140 if ((bp=(char *)strchr(line,' '))!=NULL) *bp='\0';
141 }
142 strncpy(libstr,line+1,n_libstr);
143 libstr[n_libstr-1]='\0';
144
145 if (!desc_complete) {
146 while (fgets(line, sizeof(line), fptr) != NULL) {
147 if (strchr(line,'\n') != NULL) {
148 line[0]='>';
149 break;
150 }
151 }
152 desc_complete = 1;
153 }
154 }
155 else if (seq_format==NO_FORMAT) {
156 seq_format = GCG_FORMAT;
157 qascii['*'] = qascii['X'];
158 l_offset = 10;
159 llen = strlen(line);
160 while (strncmp(&line[llen-3],"..\n",(size_t)3) != 0) {
161 if (fgets(line,sizeof(line),fptr)==NULL) return 0;
162 llen = strlen(line);
163 }
164 if (n_libstr <= 20) {
165 if ((bp=(char *)strchr(line,' '))!=NULL) *bp='\0';
166 else if ((bp=(char *)strchr(line,'\n'))!=NULL) *bp='\0';
167 }
168 strncpy(libstr,line,n_libstr);
169 libstr[n_libstr-1]='\0';
170 if (fgets(line,sizeof(line),fptr)==NULL) return 0;
171 }
172
173 if (seq_format==GCG_FORMAT && strlen(line)<l_offset) continue;
174
175 if (line[0]!='>'&& line[0]!=';') {
176 for (i=l_offset; (n<maxs)&&
177 ((ic=qascii[line[i]&AAMASK])<EL); i++)
178 if (ic<NA) seq[n++]= ic;
179 if (ic == ES) break;
180 }
181 else {
182 if (have_desc) {
183 strncpy(o_line,line,sizeof(o_line));
184 goto last;
185 }
186 else {
187 have_desc = 1;
188 }
189 }
190 }
191
192 last:
193 if (n==maxs) {
194 fprintf(stderr," sequence may be truncated %d %d\n",n,maxs);
195 fflush(stderr);
196 }
197 if ((bp=strchr(libstr,'\n'))!=NULL) *bp = '\0';
198 if ((bp=strchr(libstr,'\r'))!=NULL) *bp = '\0';
199 seq[n]= EOSEQ;
200
201 if (fptr!=stdin) fclose(fptr);
202
203 if (sset) {
204 if (sstart <= 0) sstart = 1;
205 if (sstop <= 0) sstop = n;
206 sstart--;
207 sstop--;
208 for (i=0, j=sstart; j<=sstop; i++,j++)
209 seq[i] = seq[j];
210 n = sstop - sstart +1;
211 seq[n]=EOSEQ;
212 }
213
214 return n;
215 }
216
217 int
218 gettitle(filen,title,len)
219 char *filen, *title; int len;
220 {
221 FILE *fptr;
222 char line[512];
223 char *bp;
224 int ll,sset;
225 #ifdef WIN32
226 char *strpbrk();
227 #endif
228 sset = 0;
229
230 if (use_stdin) {
231 if (use_stdin == 1) {
232 /* use_stdin++; */
233 strncpy(title,llibstr0,len);
234 }
235 else {
236 strncpy(title,llibstr1,len);
237 }
238 if ((bp=strchr(title,'\001'))!=NULL) *bp='\0';
239 return strlen(title);
240 }
241
242 if ((bp=strchr(filen,':'))!=NULL) { *bp='\0'; sset=1;}
243
244 if ((fptr=fopen(filen,"r"))==NULL) {
245 fprintf(stderr," file %s was not found\n",filen);
246 fflush(stderr);
247 return 0;
248 }
249
250 if (sset==1) filen[strlen(filen)]=':';
251
252 while(fgets(line,sizeof(line),fptr)!=0) {
253 if (line[0]=='>'|| line[0]==';') goto found;
254 }
255 fclose(fptr);
256 title[0]='\0';
257 return 0;
258
259 found:
260 if ((bp=strchr(line,'\001'))!=NULL) *bp = 0;
261 #ifdef WIN32
262 bp = strpbrk(line,"\n\r");
263 #else
264 bp = strchr(line,'\n');
265 #endif
266 if (bp!=NULL) *bp = 0;
267 strncpy(title,line,len);
268 title[len-1]='\0';
269 fclose(fptr);
270 return strlen(title);
271 }
272
273 FILE *libf=NULL; /* closed and reset to NULL by closelib() */
274
275 long lpos;
276 char lline[MAXLINE];
277 int lfflag=0; /* flag for CRLF in EMBL CDROM files */
278 #define LFCHAR '\015' /* for MWC 5.5 */ /* note: octal 015 is ASCII carriage return */
279
280 int agetlib(); void aranlib(); /* pearson fasta format */ /* installed as getlib/ranlib by open_lib() */
281
282 /* the following is from fgetgb.c */
283
284 /* a file name for open_lib may now include a library type suffix */
285 /* only opens fasta format files */
286
287 static char libn_save[MAX_FN]; /* library name saved by open_lib(); read by agetlib() through getseq() */
288 static int ldna_save=0;
289 static int do_shuffle; /* 0: next agetlib() call reads the real sequence; 1: later calls return shuffles */
290 static int shuff_cnt=10; /* shuffles still to be returned by agetlib(); set_shuffle() may raise it */
291 static int w_flag = 0; /* window width passed to wshuffle(); 0 selects shuffle() */
292 #ifdef DEBUG
293 static FILE *dfile=NULL; /* debug output opened by set_shuffle(); agetlib() writes each shuffle to it */
294 #endif
295 static unsigned char *aa_save; /* copy of the unshuffled sequence made by agetlib() */
296 static int n1_save; /* length of the sequence held in aa_save */
297 static int i_even; /* not referenced in the code shown here; agetlib() uses the separate static ieven */
298
299 /* lmf_str * is used here for compatibility with the "normal" open_lib,
300 but is largely unnecessary */
301
302 void
303 set_shuffle(struct mngmsg m_msg) {
304 char dfname[MAX_FN];
305
306 if (m_msg.shuff_wid > 0) w_flag = m_msg.shuff_wid;
307 if (m_msg.shuff_max > shuff_cnt) shuff_cnt = m_msg.shuff_max;
308
309 #ifdef DEBUG
310 if (m_msg.dfile[0]!='\0') {
311 strncpy(dfname,m_msg.dfile,sizeof(dfname));
312 strncat(dfname,"_rlib",sizeof(dfname));
313 dfile = fopen(dfname,"w");
314 }
315 #endif
316 }
317
318 struct lmf_str *
319 open_lib(char *lname, int ldnaseq, int *sascii, int quiet, struct lmf_str *m_fd)
320 {
321 char rline[10],libn[MAX_FN], *bp;
322 int wcnt, ll, opnflg;
323 int libtype;
324 struct lmf_str *m_fptr;
325
326 wcnt = 0;
327 libtype = 0;
328
329 strncpy(libn_save,lname,sizeof(libn_save));
330
331 /* now allocate a buffer for the opened text file */
332 if ((m_fptr = calloc(1,sizeof(struct lmf_str)))==NULL) {
333 fprintf(stderr," cannot allocate lmf_str (%ld) for %s\n",
334 sizeof(struct lmf_str),lname);
335 return NULL;
336 }
337
338 strncpy(m_fptr->lb_name,lname,MAX_FN);
339 m_fptr->lb_name[MAX_FN-1]='\0';
340
341 m_fptr->sascii = sascii;
342 m_fptr->getlib = agetlib;
343 m_fptr->ranlib = aranlib;
344 m_fptr->mm_flg = 0;
345
346 do_shuffle = 0;
347 irand(0); /* initialize the random number generator */
348
349 return m_fptr;
350 }
351
352 void
353 closelib()
354 {
355 if (libf!=NULL) {
356 fclose(libf);
357 libf = NULL;
358 }
359 #ifdef DEBUG
360 if (dfile) fclose(dfile);
361 #endif
362 }
363
364 static int ieven=0;
365 static char *desc_save;
366
367 int
368 agetlib(unsigned char *seq,
369 int maxs,
370 char *libstr,
371 int n_libstr,
372 fseek_t *libpos,
373 int *lcont,
374 struct lmf_str *lf_fd,
375 long *l_off)
376 {
377 long sq1_off;
378 char lib_desc[120];
379 int i;
380
381 *l_off = 1;
382
383 if (!do_shuffle) {
384 do_shuffle = 1;
385
386 if ((n1_save = getseq(libn_save,lf_fd->sascii,
387 seq,maxs,lib_desc,sizeof(lib_desc),&sq1_off)) < 1)
388 return n1_save;
389
390 strncpy(libstr,lib_desc,n_libstr);
391 libstr[n_libstr-1]='\0';
392
393 if ((aa_save = (unsigned char *)calloc(n1_save+1,sizeof(unsigned char)))==
394 NULL) fprintf(stderr," cannot allocate %d for saved sequence\n",
395 n1_save);
396 memcpy((void *)aa_save,(void *)seq,n1_save);
397
398 if ((desc_save =
399 (char *)calloc(strlen(lib_desc)+1,sizeof(char)))== NULL) {
400 fprintf(stderr," cannot allocate saved desciption [%d]\n",
401 strlen(lib_desc)+1);
402 }
403 else {
404 strncpy (desc_save,lib_desc,strlen(lib_desc));
405 desc_save[strlen(lib_desc)]=='\0';
406 }
407
408 *libpos = 0;
409 return n1_save;
410 }
411 else { /* return a shuffled sequence - here we need a window size; */
412 strncpy(libstr,desc_save,n_libstr);
413 libstr[n_libstr-1]='\0';
414
415 if (shuff_cnt-- <= 0 ) return -1;
416 if (w_flag > 0) wshuffle(aa_save,seq,n1_save,w_flag,&ieven);
417 else shuffle(aa_save,seq,n1_save);
418 seq[n1_save] = EOSEQ;
419 #ifdef DEBUG
420 if (dfile!=NULL) {
421 fprintf(dfile,">%d\n",shuff_cnt);
422 for (i=0; i<n1_save; i++) {
423 if (aa[seq[i]]>0) fputc(aa[seq[i]],dfile);
424 else {fprintf(stderr,"error aa0[%d]: %d %d\n",
425 i,seq[i],aa[seq[i]]);}
426 if (i%60 == 59) fputc('\n',dfile);
427 }
428 fputc('\n',dfile);
429 }
430 #endif
431 *libpos = 1;
432 return n1_save;
433 }
434 }
435
436 void
437 aranlib(char *str,
438 int cnt,
439 fseek_t seek,
440 char *libstr,
441 struct lmf_str *lm_fd)
442 {
443 char *bp;
444 int ll;
445
446 if (use_stdin == 2) {
447 if (llibstr1[0]=='>' || llibstr1[0]==';') {
448 strncpy(str,llibstr1+1,cnt);
449 }
450 else {
451 strncpy(str,llibstr1,cnt);
452 }
453 }
454 else {
455 strncpy(str,desc_save,cnt);
456 }
457 str[cnt-1]='\0';
458 if ((bp = strchr(str,'\001'))!=NULL) *bp='\0';
459 else if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
460 else str[cnt-1]='\0';
461 }
462
463 /*
464 void
465 revcomp(unsigned char *seq, int n, int *c_nt)
466 {
467 unsigned char tmp;
468 int i, ni;
469
470
471 for (i=0, ni = n-1; i< n/2; i++,ni--) {
472 tmp = c_nt[seq[i]];
473 seq[i] = c_nt[seq[ni]];
474 seq[ni] = tmp;
475 }
476 if ((n%2)==1) {
477 i = n/2;
478 seq[i] = c_nt[seq[i]];
479 }
480 }
481 */
482
/*
  re_openlib() - "re-open" a library.  This version has nothing to re-open,
  so the handle passed in is returned unchanged; outtty is ignored.
*/
struct lmf_str *
re_openlib(struct lmf_str *om_fptr, int outtty)
{
  struct lmf_str *same_handle = om_fptr;

  (void)outtty;
  return same_handle;
}
488
/*
  re_getlib() - "re-read" a library sequence.  Nothing is read in this
  version: the offsets are reset (*loffset = 0, *l_off = 1) and the length
  n1 supplied by the caller is returned.  All other arguments are ignored.
*/
int
re_getlib(unsigned char *aa1, int n1, int maxt3, int loff, int cont,
	  int term_code, long *loffset, long *l_off,
	  struct lmf_str *m_file_p)
{
  const int seq_len = n1;

  (void)aa1;
  (void)maxt3;
  (void)loff;
  (void)cont;
  (void)term_code;
  (void)m_file_p;

  /* same store order as before, in case both pointers name one object */
  loffset[0] = 0;
  l_off[0] = 1;

  return seq_len;
}
497
498