1 /* @source diffseq application
2 **
3 ** Find differences (SNPs) between nearly identical sequences
4 **
5 ** @author Copyright (C) Gary Williams (gwilliam@hgmp.mrc.ac.uk)
6 ** @@
7 **
8 ** This program is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or (at your option) any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 ******************************************************************************/
22
23 #include "emboss.h"
24
25
26
27
28 /* @datastatic CdsPval ****************************************************
29 **
30 ** CDS information object
31 **
32 ** @alias CdsSval
33 ** @alias CdsOval
34 **
35 ** @attr Start [ajint] Start of CDS (always less than End)
36 ** @attr End [ajint] End of CDS (always greater than Start)
37 ** @attr Phase [ajint] Phase of translation (0,1 or 2)
38 ** @attr Parent [AjBool] ajTrue is this CDS is a parent of a forward sense join
39 ** or the last CDS of a reverse sense join
40 ** @attr Single [AjBool] ajTrue is this CDS is a not member of a join()
41 ** @attr ReverseParent [AjBool] ajTrue is this CDS is a parent of rev sense join
42 ** or the last CDS in a forward sense join
43 ** @attr Local [AjBool] ajTrue is this CDS is local
44 ** @attr Sense [char] Sense '+' or '-'
45 ** @attr Padding [char[3]] Padding to alignment boundary
46 ** @@
47 ******************************************************************************/
48
49 typedef struct CdsSval
50 {
51 ajint Start;
52 ajint End;
53 ajint Phase;
54 AjBool Parent;
55 AjBool Single;
56 AjBool ReverseParent;
57 AjBool Local;
58 char Sense;
59 char Padding[3];
60 } CdsOval;
61 #define CdsPval CdsOval*
62
63
64
65
66 /* @datastatic PosPDiff ****************************************************
67 **
68 ** Position of a difference between two matching regions
69 ** If there is something inserted in one sequence that does not
70 ** occur in the other, then the other has Start after the gap and
71 ** End before the gap and Len=0.
72 **
73 ** @alias PosSDiff
74 ** @alias PosODiff
75 **
76 ** @attr Start1 [ajint] Start of difference in sequence 1 (End1+1 if gap)
77 ** @attr Start2 [ajint] Start of difference in sequence 2 (End1+1 if gap)
78 ** @attr End1 [ajint] End of difference in seq 1
79 ** @attr End2 [ajint] End of difference in seq 2
80 ** @attr Len1 [ajint] Length of difference in sequence 1 (0 = a gap)
81 ** @attr Len2 [ajint] Length of difference in sequence 2 (0 = a gap)
82 ** @attr Rev1 [AjBool] Reverse sequence 1
83 ** @attr Rev2 [AjBool] Reverse sequence 2
84 ** @@
85 ******************************************************************************/
86
87 typedef struct PosSDiff
88 {
89 ajint Start1;
90 ajint Start2;
91 ajint End1;
92 ajint End2;
93 ajint Len1;
94 ajint Len2;
95 AjBool Rev1;
96 AjBool Rev2;
97 } PosODiff;
98 #define PosPDiff PosODiff*
99
100
101 static void diffseq_Diff(const AjPList difflist,
102 const AjPSeq seq1, const AjPSeq seq2,
103 AjPReport report, AjPFeattable ftab,
104 ajint over1start, ajint over1end,
105 ajint over2start, ajint over2end);
106
107 static void diffseq_WordMatchListConvDiffToFeat(const AjPList list,
108 AjPFeattable *tab1,
109 AjPFeattable *tab2,
110 const AjPSeq seq1,
111 const AjPSeq seq2);
112
113 static void diffseq_Features(const char* typefeat, AjPFeature rf,
114 const AjPFeattable feat,
115 ajuint start, ajuint end);
116
117 static void diffseq_AddTags(AjPStr* strval, const AjPFeature feat,
118 AjBool values);
119
120 static void diffseq_DiffList(const AjPList matchlist, AjPList difflist,
121 AjBool global, const AjPSeq seq1,
122 const AjPSeq seq2, ajint *over1start,
123 ajint *over1end, ajint *over2start,
124 ajint *over2end);
125
126 static PosPDiff diffseq_PosPDiffNew(void);
127
128 static void diffseq_PosPDiffDel(void **x, void *cl);
129
130 static void diffseq_FeatSetCDSFrame(AjPFeattable ftab);
131
132
133
134
135 /* @prog diffseq **************************************************************
136 **
137 ** Find differences (SNPs) between nearly identical sequences
138 **
139 ******************************************************************************/
140
main(int argc,char ** argv)141 int main(int argc, char **argv)
142 {
143 AjPSeq seq1;
144 AjPSeq seq2;
145 ajint wordlen;
146 AjPTable seq1MatchTable = 0;
147 AjPList matchlist = NULL;
148 AjPList difflist = NULL;
149 AjPReport report;
150 AjPFeattable Tab1 = NULL;
151 AjPFeattable Tab2 = NULL;
152 AjPFeattable TabRpt = NULL;
153 AjPFeattabOut seq1out = NULL;
154 AjPFeattabOut seq2out = NULL;
155 AjPStr tmpstr=NULL;
156 AjBool global; /* true if want to report the differences at the ends */
157 ajint over1start; /* start and end positions of match overlap */
158 ajint over1end; /* ... for sequence 1 */
159 ajint over2start = 0; /* overlap for sequence 2 */
160 ajint over2end;
161
162
163 embInit("diffseq", argc, argv);
164
165 wordlen = ajAcdGetInt("wordsize");
166 seq1 = ajAcdGetSeq("asequence");
167 seq2 = ajAcdGetSeq("bsequence");
168 report = ajAcdGetReport("outfile");
169 seq1out = ajAcdGetFeatout("aoutfeat");
170 seq2out = ajAcdGetFeatout("boutfeat");
171
172 /* advanced qualifiers */
173 global = ajAcdGetBoolean("globaldifferences");
174
175 ajSeqTrim(seq1);
176 ajSeqTrim(seq2);
177
178 TabRpt = ajFeattableNewSeq(seq1);
179
180 embWordLength(wordlen);
181 if(embWordGetTable(&seq1MatchTable, seq1))
182 /* get table of words */
183 matchlist = embWordBuildMatchTable(seq1MatchTable, seq2, ajTrue);
184
185
186 /* get the minimal set of overlapping matches */
187 if(matchlist)
188 embWordMatchMin(matchlist);
189
190
191 if(matchlist)
192 {
193 /* convert the list of matches to a list of differences */
194 difflist = ajListNew();
195 diffseq_DiffList(matchlist, difflist, global, seq1, seq2,
196 &over1start, &over1end, &over2start, &over2end);
197
198
199 /* output the gff files */
200 diffseq_WordMatchListConvDiffToFeat(difflist, &Tab1, &Tab2,
201 seq1, seq2);
202
203
204 diffseq_Diff(difflist, seq1, seq2, report, TabRpt,
205 over1start, over1end, over2start, over2end);
206
207
208 embWordMatchListDelete(&matchlist); /* free the match structures */
209
210 /* delete difflist */
211 ajListMap(difflist, &diffseq_PosPDiffDel, NULL);
212 ajListFree(&difflist);
213
214 }
215 ajReportSetStatistics(report, 2,
216 ajSeqGetLenTrimmed(seq1)+ajSeqGetLenTrimmed(seq2));
217
218 ajFeattableWrite(seq1out, Tab1);
219 ajFeattableWrite(seq2out, Tab2);
220
221 tmpstr = ajStrNew();
222 ajFmtPrintS(&tmpstr, "Feature file for first sequence");
223 ajReportAddFileF(report, ajFeattabOutFile(seq1out), tmpstr);
224
225 ajFmtPrintS(&tmpstr, "Feature file for second sequence");
226 ajReportAddFileF(report, ajFeattabOutFile(seq2out), tmpstr);
227
228 ajReportWrite(report, TabRpt, seq1);
229 ajReportClose(report);
230
231 /* tidy up */
232 ajSeqDel(&seq1);
233 ajSeqDel(&seq2);
234
235 embWordFreeTable(&seq1MatchTable);
236
237 ajReportDel(&report);
238
239 ajListFree(&matchlist);
240 ajListFree(&difflist);
241
242 ajFeattableDel(&Tab1);
243 ajFeattableDel(&Tab2);
244 ajFeattableDel(&TabRpt);
245 ajFeattabOutDel(&seq1out);
246 ajFeattabOutDel(&seq2out);
247
248 ajStrDel(&tmpstr);
249
250
251 embExit();
252 return 0;
253 }
254
255
256
257
258 /* @funcstatic diffseq_Diff ************************************************
259 **
260 ** Do a diff and build a report on the diff of the two sequences.
261 **
262 ** @param [r] difflist [const AjPList] List of differences
263 ** @param [r] seq1 [const AjPSeq] Sequence to be diff'd.
264 ** @param [r] seq2 [const AjPSeq] Sequence to be diff'd.
265 ** @param [u] report [AjPReport] Report object.
266 ** @param [u] ftab [AjPFeattable] Report feature table
267 ** @param [r] over1start [ajint] start of overlap region in sequence1
268 ** @param [r] over1end [ajint] end of overlap region in sequence1
269 ** @param [r] over2start [ajint] start of overlap region in sequence2
270 ** @param [r] over2end [ajint] end of overlap region in sequence2
271 ** @return [void]
272 ** @@
273 ******************************************************************************/
274
diffseq_Diff(const AjPList difflist,const AjPSeq seq1,const AjPSeq seq2,AjPReport report,AjPFeattable ftab,ajint over1start,ajint over1end,ajint over2start,ajint over2end)275 static void diffseq_Diff(const AjPList difflist,
276 const AjPSeq seq1, const AjPSeq seq2,
277 AjPReport report, AjPFeattable ftab,
278 ajint over1start, ajint over1end,
279 ajint over2start, ajint over2end)
280 {
281 AjIList iter = NULL; /* match list iterator */
282 PosPDiff diff = NULL; /* difference structure */
283 const AjPStr s1; /* string of seq1 */
284 const AjPStr s2; /* string of seq2 */
285 AjPStr tmp; /* temporary string */
286
287 /* stuff for counting SNPs, transitions & transversions */
288 ajint snps = 0;
289 ajint transitions = 0;
290 ajint transversions = 0; /* counts of SNP types */
291 char base1 = '\0';
292 char base2 = '\0';
293 AjPStr tmpstr = NULL;
294 static AjPStr tmpseq = NULL;
295
296 AjPFeattable feat1 = NULL;
297 AjPFeattable feat2 = NULL;
298
299 AjPFeature gf = NULL;
300 ajuint len1 = 0;
301 ajuint len2 = 0;
302
303 tmpstr = ajStrNew();
304 tmp = ajStrNew();
305
306 len1 = ajSeqGetLen(seq1);
307 len2 = ajSeqGetLen(seq2);
308
309 s1 = ajSeqGetSeqS(seq1);
310 s2 = ajSeqGetSeqS(seq2);
311
312 feat1 = ajSeqGetFeatCopy(seq1);
313 feat2 = ajSeqGetFeatCopy(seq2);
314
315 /*
316 ** Fix the Frame field in the CDS features of the feature table for
317 ** seq1. These are often left as 'unsure' by the ajfeat routines.
318 */
319 diffseq_FeatSetCDSFrame(feat1);
320
321 /* create report header */
322 ajFmtPrintS(&tmpstr, "Compare: %S from: %d to: %d\n\n",
323 ajReportGetSeqnameSeq(report, seq2),
324 ajSeqGetBegin(seq2), ajSeqGetEnd(seq2));
325 if (over1start != -1)
326 {
327 ajFmtPrintAppS(&tmpstr, "%S overlap starts at %d\n",
328 ajReportGetSeqnameSeq(report, seq1),
329 over1start);
330 ajFmtPrintAppS(&tmpstr, "%S overlap starts at %d\n\n",
331 ajReportGetSeqnameSeq(report, seq2),
332 over2start);
333 }
334 ajReportSetHeaderS(report, tmpstr);
335
336
337 iter = ajListIterNewread(difflist);
338 while(!ajListIterDone(iter))
339 {
340 diff = (PosPDiff) ajListIterGet(iter) ;
341
342 /* seq 1 details */
343 if(diff->Len1 > 0)
344 {
345 if(diff->Rev1)
346 {
347 gf = ajFeatNewIIRev(ftab,
348 (len1 - diff->End1 + 1),
349 (len1 - diff->Start1 + 1));
350 }
351 else
352 {
353 gf = ajFeatNewII(ftab, diff->Start1, diff->End1);
354 }
355
356 ajStrAssignSubS(&tmp, s1, diff->Start1-1, diff->End1-1);
357 base1 = * ajStrGetPtr(tmp);
358 }
359 else
360 {
361 gf = ajFeatNewBetween(ftab, diff->Start1);
362 ajStrAssignC(&tmp, "");
363 }
364 diffseq_Features("first_feature", gf,
365 feat1, diff->Start1, diff->End1);
366
367 /* seq2 details */
368 diffseq_Features("second_feature", gf,
369 feat2, diff->Start2, diff->End2);
370
371 ajFmtPrintS(&tmp, "*name %S", ajReportGetSeqnameSeq(report, seq2));
372 ajFeatTagAddSS(gf, NULL, tmp);
373
374 ajFmtPrintS(&tmp, "*length %d", diff->Len2);
375 ajFeatTagAddSS(gf, NULL, tmp);
376
377 if(diff->Rev2)
378 {
379 ajFmtPrintS(&tmp, "*strand -");
380 ajFeatTagAddSS(gf, NULL, tmp);
381 }
382
383 if(diff->Len2 > 0)
384 {
385 if(diff->Rev2)
386 {
387 ajFmtPrintS(&tmp, "*start %d", (len2 - diff->End2 + 1));
388 ajFeatTagAddSS(gf, NULL, tmp);
389 ajFmtPrintS(&tmp, "*end %d", (len2 - diff->Start2 + 1));
390 ajFeatTagAddSS(gf, NULL, tmp);
391 }
392 else
393 {
394 ajFmtPrintS(&tmp, "*start %d", diff->Start2);
395 ajFeatTagAddSS(gf, NULL, tmp);
396 ajFmtPrintS(&tmp, "*end %d", diff->End2);
397 ajFeatTagAddSS(gf, NULL, tmp);
398 }
399
400 ajStrAssignSubS(&tmpseq, s2, diff->Start2-1, diff->End2-1);
401 ajFmtPrintS(&tmp, "*sequence %S", tmpseq);
402 ajFeatTagAddSS(gf, NULL, tmp);
403 base2 = * ajStrGetPtr(tmpseq);
404 }
405 else
406 {
407 if(diff->Rev2)
408 {
409 ajFmtPrintS(&tmp, "*start %d", (len2 - diff->Start2 + 1));
410 ajFeatTagAddSS(gf, NULL, tmp);
411 ajFmtPrintS(&tmp, "*end %d", (len2 - diff->Start2 + 1));
412 ajFeatTagAddSS(gf, NULL, tmp);
413 }
414 else
415 {
416 ajFmtPrintS(&tmp, "*start %d", diff->End2);
417 ajFeatTagAddSS(gf, NULL, tmp);
418 ajFmtPrintS(&tmp, "*end %d", diff->End2);
419 ajFeatTagAddSS(gf, NULL, tmp);
420 }
421 }
422
423 /* count SNPs, transitions & transversions */
424 if(diff->Len1 == 1 && diff->Len2 == 1)
425 {
426 snps++;
427 transitions += (ajint) embPropTransition(base1, base2);
428 transversions += (ajint) embPropTransversion(base1, base2);
429 }
430
431 }
432
433 ajListIterDel(&iter);
434
435 /* create report tail */
436 if(over1start != -1)
437 {
438 ajFmtPrintS(&tmp, "Overlap_end: %d in %S\n",
439 over1end,
440 ajReportGetSeqnameSeq(report, seq1));
441 ajFmtPrintAppS(&tmp, "Overlap_end: %d in %S\n",
442 over2end,
443 ajReportGetSeqnameSeq(report, seq2));
444 if(ajSeqIsNuc(seq1))
445 {
446 ajFmtPrintAppS(&tmp, "\n");
447 ajFmtPrintAppS(&tmp, "SNP_count: %d\n", snps);
448 ajFmtPrintAppS(&tmp, "Transitions: %d\n", transitions);
449 ajFmtPrintAppS(&tmp, "Transversions: %d\n", transversions);
450 }
451 }
452 else
453 /* no iterations of the match list done - ie no matches */
454 ajFmtPrintS(&tmp, "No regions of alignment found.\n");
455
456 ajReportSetTailS(report, tmp);
457
458
459 ajStrDel(&tmp);
460 ajFeattableDel(&feat1);
461 ajFeattableDel(&feat2);
462
463 ajStrDel(&tmpstr);
464 ajStrDel(&tmpseq);
465
466 return;
467 }
468
469
470
471
472 /* @funcstatic diffseq_WordMatchListConvDiffToFeat ****************************
473 **
474 ** Convert the differences list to feature tables for output.
475 **
476 ** @param [r] list [const AjPList] differences list
477 ** @param [u] tab1 [AjPFeattable*] feature table for sequence 1
478 ** @param [u] tab2 [AjPFeattable*] feature table for sequence 2
479 ** @param [r] seq1 [const AjPSeq] sequence 1
480 ** @param [r] seq2 [const AjPSeq] sequence 2
481 ** @return [void]
482 ** @@
483 ******************************************************************************/
484
diffseq_WordMatchListConvDiffToFeat(const AjPList list,AjPFeattable * tab1,AjPFeattable * tab2,const AjPSeq seq1,const AjPSeq seq2)485 static void diffseq_WordMatchListConvDiffToFeat(const AjPList list,
486 AjPFeattable *tab1,
487 AjPFeattable *tab2,
488 const AjPSeq seq1,
489 const AjPSeq seq2)
490 {
491 ajint frame = 0;
492 AjPStr source = NULL;
493 AjPStr type = NULL;
494 AjPStr note = NULL;
495 AjPStr replace = NULL;
496 AjPFeature feature = NULL;
497 AjIList iter = NULL;
498 AjPStr notestr = NULL;
499 AjPStr replacestr = NULL;
500 AjPStr sourcestr = NULL;
501 AjPStr conflictstr = NULL;
502 float score = 0.0;
503 ajuint len1 = 0;
504 ajuint len2 = 0;
505
506 if(!*tab1)
507 *tab1 = ajFeattableNewSeq(seq1);
508
509 if(!*tab2)
510 *tab2 = ajFeattableNewSeq(seq2);
511
512 source = ajStrNew();
513 type = ajStrNew();
514 note = ajStrNew();
515 replace = ajStrNew();
516 replacestr = ajStrNew();
517 notestr = ajStrNew();
518
519 ajStrAssignC(&source,"diffseq");
520 ajStrAssignC(&type,"conflict");
521 ajStrAssignC(¬e,"note");
522 ajStrAssignC(&replace,"replace");
523 score = 1.0;
524
525 len1 = ajSeqGetLen(seq1);
526 len2 = ajSeqGetLen(seq2);
527
528 iter = ajListIterNewread(list);
529 while(!ajListIterDone(iter))
530 {
531 PosPDiff diff = (PosPDiff) ajListIterGet(iter);
532
533 if(diff->Len1)
534 { /* is there a gap between the matches? */
535 if(diff->Rev1)
536 feature = ajFeatNew(*tab1, source, type,
537 len1 - diff->Start1 + 1,
538 len1 - diff->End1 + 1,
539 score, '-', frame);
540 else
541 feature = ajFeatNew(*tab1, source, type,
542 diff->Start1, diff->End1,
543 score, '+', frame);
544
545 if(diff->Len1 == 1 && diff->Len2 == 1)
546 ajFmtPrintS(¬estr, "SNP in %S", ajSeqGetNameS(seq2));
547 else if(diff->Len2 == 0)
548 ajFmtPrintS(¬estr, "Insertion of %d bases in %S",
549 diff->Len1, ajSeqGetNameS(seq1));
550 else
551 ajFmtPrintS(¬estr, "%S", ajSeqGetNameS(seq2));
552
553 ajFeatTagSet(feature, note, notestr);
554
555 if(diff->Len2 > 0)
556 ajStrAssignSubS(&replacestr, ajSeqGetSeqS(seq2),
557 diff->Start2-1, diff->End2-1);
558 else
559 ajStrAssignC(&replacestr, "");
560
561 if(ajFeattableIsProt(*tab1))
562 {
563 if(ajStrGetLen(replacestr))
564 {
565 ajStrAssignSubS(&sourcestr, ajSeqGetSeqS(seq1),
566 diff->Start1-1, diff->End1-1);
567 ajFmtPrintS(&conflictstr, "%S -> %S",
568 sourcestr, replacestr);
569 }
570 else
571 ajFmtPrintS(&conflictstr, "MISSING");
572
573 ajFeatTagSet(feature, note, conflictstr);
574 }
575 else
576 ajFeatTagSet(feature, replace, replacestr);
577 }
578
579 if(diff->Len2)
580 { /* is there a gap between the matches? */
581 if(diff->Rev2)
582 feature = ajFeatNew(*tab2, source, type,
583 len2 - diff->Start2 + 1,
584 len2 - diff->End2 + 1,
585 score, '-', frame);
586 else
587 feature = ajFeatNew(*tab2, source, type,
588 diff->Start2, diff->End2,
589 score, '+', frame);
590
591 if(diff->Len2 == 1 && diff->Len1 == 1)
592 ajFmtPrintS(¬estr, "SNP in %S", ajSeqGetNameS(seq1));
593 else if(diff->Len1 == 0)
594 ajFmtPrintS(¬estr, "Insertion of %d bases in %S",
595 diff->Len2, ajSeqGetNameS(seq2));
596 else
597 ajFmtPrintS(¬estr, "%S", ajSeqGetNameS(seq1));
598
599 ajFeatTagSet(feature, note, notestr);
600
601 if(diff->Len1 > 0)
602 ajStrAssignSubS(&replacestr, ajSeqGetSeqS(seq1),
603 diff->Start1-1, diff->End1-1);
604 else
605 ajStrAssignC(&replacestr, "");
606
607 if(ajFeattableIsProt(*tab2))
608 {
609 if(ajStrGetLen(replacestr))
610 {
611 ajStrAssignSubS(&sourcestr, ajSeqGetSeqS(seq2),
612 diff->Start2-1, diff->End2-1);
613 ajFmtPrintS(&conflictstr, "%S -> %S",
614 sourcestr, replacestr);
615 }
616 else
617 ajFmtPrintS(&conflictstr, "MISSING");
618
619 ajFeatTagSet(feature, note, conflictstr);
620 }
621 else
622 ajFeatTagSet(feature, replace, replacestr);
623
624 }
625
626 }
627
628 ajListIterDel(&iter);
629 ajStrDel(&source);
630 ajStrDel(&type);
631 ajStrDel(¬e);
632 ajStrDel(&replace);
633 ajStrDel(&replacestr);
634 ajStrDel(&sourcestr);
635 ajStrDel(&conflictstr);
636 ajStrDel(¬estr);
637
638 return;
639 }
640
641
642
643
644 /* @funcstatic diffseq_Features ********************************************
645 **
646 ** Write out any features which overlap this region.
647 ** Don't write out the source feature - far too irritating!
648 **
649 ** @param [r] typefeat [const char*] Report feature tag type
650 ** @param [u] rf [AjPFeature] Report feature to store results in
651 ** @param [r] feat [const AjPFeattable] Feature table to search
652 ** @param [r] start [ajuint] Start position of region (in human coordinates)
653 ** @param [r] end [ajuint] End position of region (in human coordinates)
654 ** @return [void]
655 ** @@
656 ******************************************************************************/
657
diffseq_Features(const char * typefeat,AjPFeature rf,const AjPFeattable feat,ajuint start,ajuint end)658 static void diffseq_Features(const char* typefeat, AjPFeature rf,
659 const AjPFeattable feat,
660 ajuint start, ajuint end)
661 {
662 AjIList iter = NULL;
663 AjPFeature gf = NULL;
664 AjPStr tmp = NULL;
665
666 if(!feat)
667 return;
668
669 if(feat->Features)
670 {
671 iter = ajListIterNewread(feat->Features);
672 while(!ajListIterDone(iter))
673 {
674 gf = ajListIterGet(iter);
675
676
677 /* check that the feature is within the range for display */
678 if(start > ajFeatGetEnd(gf) || end < ajFeatGetStart(gf))
679 continue;
680
681 /* don't output the 'source' feature */
682 if(!ajStrCmpC(ajFeatGetType(gf), "source"))
683 continue;
684
685 /* write out the feature details */
686 ajFmtPrintS(&tmp, "*%s %S %d-%d",
687 typefeat, ajFeatGetType(gf),
688 ajFeatGetStart(gf), ajFeatGetEnd(gf));
689 diffseq_AddTags(&tmp, gf, ajTrue);
690 ajFeatTagAddSS(rf, NULL, tmp);
691
692 }
693 ajListIterDel(&iter) ;
694 }
695
696
697 ajStrDel(&tmp);
698
699 return;
700 }
701
702
703
704
705 /* @funcstatic diffseq_AddTags *********************************************
706 **
707 ** Appends feature tag values to a string in a simple format.
708 ** Don't write out the translation - is it often far too long!
709 **
710 ** @param [u] strval [AjPStr*] String
711 ** @param [r] feat [const AjPFeature] Feature to be processed
712 ** @param [r] values [AjBool] display values of tags
713 **
714 ** @return [void]
715 ** @@
716 ******************************************************************************/
717
diffseq_AddTags(AjPStr * strval,const AjPFeature feat,AjBool values)718 static void diffseq_AddTags(AjPStr* strval,
719 const AjPFeature feat, AjBool values)
720 {
721 AjIList titer; /* iterator for taglist */
722 AjPStr tagnam = NULL;
723 AjPStr tagval = NULL;
724
725 /* iterate through the tags and test for match to patterns */
726
727 titer = ajFeatTagIter(feat);
728 while(ajFeatTagval(titer, &tagnam, &tagval))
729 /* don't display the translation tag - it is far too long :-) */
730 if(!ajStrMatchC(tagnam, "translation"))
731 {
732 if(values == ajTrue)
733 ajFmtPrintAppS(strval, " %S='%S'", tagnam, tagval);
734 else
735 ajFmtPrintAppS(strval, " %S", tagnam);
736 }
737
738 ajListIterDel(&titer);
739 ajStrDel(&tagnam);
740 ajStrDel(&tagval);
741
742 return;
743 }
744
745
746
747
748 /* @funcstatic diffseq_DiffList ************************************************
749 **
750 ** Converts a list of matching regions into a list of differences
751 ** The positions are held in human coordinates (starting at 1)
752 ** rather than computer coordinates (starting at 0) because
753 ** there will be a lot of comparing/writing to feature tables
754 ** which use human coordinates.
755 ** over1start is returned with value '-1' if no matching regions were found.
756 **
757 ** @param [r] matchlist [const AjPList] List of minimal non-overlapping matches
758 ** @param [u] difflist [AjPList] Resulting list of differences
759 ** @param [r] global [AjBool] ajTrue if we want the differences at the ends
760 ** @param [r] seq1 [const AjPSeq] sequence 1
761 ** @param [r] seq2 [const AjPSeq] sequence 2
762 ** @param [w] over1start [ajint *] start of overlap region in sequence1
763 ** @param [w] over1end [ajint *] end of overlap region in sequence1
764 ** @param [w] over2start [ajint *] start of overlap region in sequence2
765 ** @param [w] over2end [ajint *] end of overlap region in sequence2
766 ** @return [void]
767 ** @@
768 ******************************************************************************/
769
diffseq_DiffList(const AjPList matchlist,AjPList difflist,AjBool global,const AjPSeq seq1,const AjPSeq seq2,ajint * over1start,ajint * over1end,ajint * over2start,ajint * over2end)770 static void diffseq_DiffList(const AjPList matchlist, AjPList difflist,
771 AjBool global,
772 const AjPSeq seq1, const AjPSeq seq2,
773 ajint *over1start, ajint *over1end,
774 ajint *over2start, ajint *over2end)
775 {
776
777 AjIList iter = NULL;
778 ajint misstart1 = -1; /* start of mismatch region in seq1 */
779 ajint misstart2 = -1; /* start of mismatch region in seq2 */
780 ajint misend1;
781 ajint misend2; /* end of mismatch region */
782 PosPDiff diff = NULL; /* Difference object */
783 ajint i;
784 ajint j;
785 const char *seqc1;
786 const char *seqc2;
787 AjBool rev1 = ajFalse;
788 AjBool rev2 = ajFalse;
789
790 seqc1 = ajSeqGetSeqC(seq1);
791 seqc2 = ajSeqGetSeqC(seq2);
792
793 rev1 = ajSeqIsReversed(seq1);
794 rev2 = ajSeqIsReversed(seq2);
795
796 *over1start = -1; /* flag for no matches found */
797
798 iter = ajListIterNewread(matchlist);
799 while(!ajListIterDone(iter))
800 {
801 EmbPWordMatch p =(EmbPWordMatch) ajListIterGet(iter) ;
802
803 ajDebug("diffseq match len: %d seq1 %u seq2 %u\n",
804 p->length, p->seq1start, p->seq2start);
805
806 misend1 = p->seq1start;
807 misend2 = p->seq2start;
808
809 if (misstart1 == -1) /* this is the first iteration */
810 {
811 /* note the end of the overlap (start of first match) */
812 *over1start = p->seq1start + 1;
813 *over2start = p->seq2start + 1;
814
815 /* add difference at the start, if required */
816 if(global && /* we want the global differences */
817 (p->seq1start != 0 ||
818 p->seq2start != 0)) /* no match at pos 1,1 */
819 {
820 diff = diffseq_PosPDiffNew();
821 /*
822 ** look for matches of less than the size of a word that
823 ** will have been missed by the word-match routines, but which
824 ** look weird if they are described as differences in
825 ** the output.
826 */
827 diff->Start1 = 1;
828 diff->Start2 = 1;
829 diff->End1 = misend1;
830 diff->End2 = misend2;
831 diff->Len1 = diff->End1 - diff->Start1 + 1;
832 diff->Len2 = diff->End2 - diff->Start2 + 1;
833 diff->Rev1 = rev1;
834 diff->Rev2 = rev2;
835
836 /*
837 ** If there are mismatches on both sequences, see if we can
838 ** make a small match at the starts to tidy things up a bit.
839 ** In other words, we increase the start of the mismatch if
840 ** there are any matching bases there.
841 */
842 for (i=0;
843 diff->Len1 && diff->Len2 &&
844 tolower((ajint)seqc1[i]) == tolower((ajint)seqc2[i]); i++)
845 {
846 diff->Len1--;
847 diff->Len2--;
848 diff->Start1++;
849 diff->Start2++;
850 }
851
852 /* add node to the end of the list */
853 ajListPushAppend(difflist, diff);
854 }
855 }
856 else /* this is a mismatch between two matches */
857 {
858 diff = diffseq_PosPDiffNew();
859 diff->Start1 = misstart1;
860 diff->Start2 = misstart2;
861 diff->End1 = misend1;
862 diff->End2 = misend2;
863 diff->Len1 = diff->End1 - diff->Start1 + 1;
864 diff->Len2 = diff->End2 - diff->Start2 + 1;
865 diff->Rev1 = rev1;
866 diff->Rev2 = rev2;
867
868 ajDebug("diffseq diff seq1 %d..%d (%d) seq2: %d..%d (%d)\n",
869 diff->Start1, diff->End1, diff->Len1,
870 diff->Start2, diff->End2, diff->Len2);
871
872 /* add node to the end of the list */
873 ajListPushAppend(difflist, diff);
874 }
875
876 /* note the start position of the next mismatch */
877 misstart1 = p->seq1start + p->length + 1;
878 misstart2 = p->seq2start + p->length + 1;
879 }
880
881 /* note the end of the overlap (end of last match) */
882 *over1end = misstart1-1;
883 *over2end = misstart2-1;
884
885 /* add difference at the end, if required */
886 if(global && /* we want the global differences */
887 (misstart1 <= (ajint) ajSeqGetLen(seq1) || /* no match at the end */
888 misstart2 <= (ajint) ajSeqGetLen(seq2)))
889 {
890 diff = diffseq_PosPDiffNew();
891 diff->Start1 = misstart1;
892 diff->Start2 = misstart2;
893 diff->End1 = ajSeqGetLen(seq1);
894 diff->End2 = ajSeqGetLen(seq2);
895 diff->Len1 = diff->End1 - diff->Start1 + 1;
896 diff->Len2 = diff->End2 - diff->Start2 + 1;
897 diff->Rev1 = rev1;
898 diff->Rev2 = rev2;
899
900 /*
901 ** If there are mismatches on both sequences, see if we can
902 ** make a small match at the ends to tidy things up a bit.
903 ** In other words, we reduce the end of the mismatch if there
904 ** are any matching bases there.
905 */
906 for (i=ajSeqGetLen(seq1), j=ajSeqGetLen(seq2);
907 diff->Len1 && diff->Len2 &&
908 tolower((ajint)seqc1[i-1]) == tolower((ajint)seqc2[j-1]);
909 i--, j--)
910 {
911 diff->Len1--;
912 diff->Len2--;
913 diff->End1--;
914 diff->End2--;
915 }
916 /* add node to the end of the list */
917 ajListPushAppend(difflist, diff);
918 }
919
920 ajListIterDel(&iter);
921
922 return;
923 }
924
925
926
927
928 /* @funcstatic diffseq_PosPDiffNew ********************************************
929 **
930 ** Constructor for an empty PosPDiff object
931 **
932 ** @return [PosPDiff] Diference object
933 ** @category new [PosPDiff] Constructor
934 ** @@
935 ******************************************************************************/
936
diffseq_PosPDiffNew(void)937 static PosPDiff diffseq_PosPDiffNew(void)
938 {
939 PosPDiff pthis;
940 AJNEW0(pthis);
941
942 return pthis;
943 }
944
945
946
947
948 /* @funcstatic diffseq_PosPDiffDel ********************************************
949 **
950 ** Destructor for a PosPDiff object for use with ajListMap
951 **
952 ** @param [r] x [void**] Undocumented
953 ** @param [r] cl [void*] Undocumented
954 ** @return [void]
955 ** @@
956 ******************************************************************************/
957
diffseq_PosPDiffDel(void ** x,void * cl)958 static void diffseq_PosPDiffDel(void **x, void *cl)
959 {
960 PosPDiff thys;
961 thys = (PosPDiff)*x;
962
963 (void) cl; /* make it used */
964
965 AJFREE(thys);
966
967 return;
968 }
969
970
971
972
973 /* @funcstatic diffseq_FeatSetCDSFrame ****************************************
974 **
975 ** The Feature object in the ajfeat library has a field 'Frame' but it is
976 ** often left as '0' (unknown) in CDS features. This routine fixes it.
977 ** This routine assumes that the CDS features within a join()
978 ** are sorted by start position.
979 **
980 ** Frame value of 0 is unknown, values 1,2,3 are equal to GFF phases 0,1,2
981 **
982 ** @param [u] ftab [AjPFeattable] Feature table
983 ** @return [void]
984 ** @@
985 ******************************************************************************/
986
diffseq_FeatSetCDSFrame(AjPFeattable ftab)987 static void diffseq_FeatSetCDSFrame(AjPFeattable ftab)
988 {
989
990 AjIList iter = NULL; /* feature table iterator */
991 AjIList titer = NULL; /* feature tags iterator */
992 AjPFeature gf = NULL;
993 ajint phase = 0; /* translation phase, 0,1 or 2 */
994 ajint prevstart = 0; /* start and end of previous CDS in table */
995 ajint prevend = 0;
996 AjBool prevparent; /* flag true if prev CDS was a parent */
997 AjBool unsure; /* true if we are unsure of the phase */
998 AjPStr tagnam = NULL;/* name and value of tags of the feature */
999 AjPStr tagval = NULL;
1000
1001 unsure = ajFalse;
1002
1003 if(!ftab)
1004 return;
1005
1006 if(!ftab->Features)
1007 return;
1008
1009 iter = ajListIterNewread(ftab->Features);
1010 while(!ajListIterDone(iter))
1011 {
1012 gf = ajListIterGet(iter);
1013
1014 /* is this a CDS feature? */
1015 if (ajStrCmpC(ajFeatGetType(gf), "CDS"))
1016 continue;
1017
1018 /* is this a forward sense CDS? */
1019 if(gf->Strand == '+')
1020 {
1021
1022 /* is this the start of a new group of CDS in a multi-exon gene? */
1023 /* reset the phase to zero, or continue it accordingly */
1024 phase = 0;
1025 unsure = ajFalse;
1026
1027 /* are we unsure about the start of this CDS? */
1028 if ((gf->Flags & AJFEATFLAG_START_BEFORE_SEQ) ||
1029 (gf->Flags & AJFEATFLAG_START_UNSURE) ||
1030 (gf->Flags & AJFEATFLAG_START_TWO))
1031 unsure = ajTrue;
1032
1033 /* check to see if the /codon_start tag is set on this feature */
1034
1035 /* check to see if the /codon_start tag is set on this feature */
1036 titer = ajFeatTagIter(gf);
1037 while(ajFeatTagval(titer, &tagnam, &tagval))
1038 if(!ajStrCmpC(tagnam, "codon_start"))
1039 {
1040 ajStrToInt(tagval, &phase);
1041 phase--;
1042 unsure = ajFalse;
1043 }
1044 ajListIterDel(&titer);
1045
1046
1047 /* If we are sure, set the Frame field in the feature
1048 ** object.
1049 */
1050 if (unsure)
1051 gf->Frame = 0;
1052 else
1053 if (gf->Frame == 0)
1054 gf->Frame = phase+1;
1055 }
1056
1057 /* are we unsure about the end of this CDS? */
1058 if ((gf->Flags & AJFEATFLAG_END_AFTER_SEQ) ||
1059 (gf->Flags & AJFEATFLAG_END_UNSURE) ||
1060 (gf->Flags & AJFEATFLAG_END_TWO))
1061 unsure = ajTrue;
1062
1063 /* remember the start/end of this feature */
1064 prevstart = ajFeatGetStart(gf);
1065 prevend = ajFeatGetEnd(gf);
1066 }
1067
1068 ajListIterDel(&iter);
1069
1070 /*
1071 ** Go back up through list filling in the frame of the reverse sense
1072 ** CDS features.
1073 */
1074 unsure = ajFalse;
1075 prevparent = ajTrue;
1076 iter = ajListIterNewBack(ftab->Features);
1077 while(!ajListIterDoneBack(iter))
1078 {
1079 gf = ajListIterGetBack(iter);
1080
1081 /* is this a CDS feature? */
1082 if (ajStrCmpC(ajFeatGetType(gf), "CDS"))
1083 continue;
1084
1085 /* is this a reverse sense CDS? */
1086 if(gf->Strand == '-')
1087 {
1088 /* is this a new CDS group? - set the phase */
1089 if(prevparent)
1090 {
1091 phase = 0;
1092 unsure = ajFalse;
1093 }
1094 else
1095 {
1096 phase += (prevend + 1 - prevstart) % 3;
1097 phase %= 3;
1098 }
1099
1100 /* are we unsure about the end of this CDS? */
1101 if ((gf->Flags & AJFEATFLAG_END_AFTER_SEQ) ||
1102 (gf->Flags & AJFEATFLAG_END_UNSURE) ||
1103 (gf->Flags & AJFEATFLAG_END_TWO))
1104 unsure = ajTrue;
1105
1106 /* check to see if the /codon_start tag is set on this feature */
1107 titer = ajFeatTagIter(gf);
1108 while(ajFeatTagval(titer, &tagnam, &tagval))
1109 if(!ajStrCmpC(tagnam, "codon_start"))
1110 {
1111 ajStrToInt(tagval, &phase);
1112 phase--;
1113 unsure = ajFalse;
1114 }
1115 ajListIterDel(&titer);
1116
1117 /*
1118 ** If we are sure, set the Frame field in the feature
1119 ** object.
1120 */
1121 if (unsure)
1122 gf->Frame = 0;
1123 else
1124 if (gf->Frame == 0)
1125 gf->Frame = phase+1;
1126 }
1127
1128 /* remember the parental status of this CDS */
1129 prevparent = ajTrue;
1130
1131 /* are we unsure about the start of this CDS? */
1132 if ((gf->Flags & AJFEATFLAG_START_BEFORE_SEQ) ||
1133 (gf->Flags & AJFEATFLAG_START_UNSURE) ||
1134 (gf->Flags & AJFEATFLAG_START_TWO))
1135 unsure = ajTrue;
1136
1137 /* remember the start/end of this feature */
1138 prevstart = ajFeatGetStart(gf);
1139 prevend = ajFeatGetEnd(gf);
1140 }
1141
1142 ajListIterDel(&iter);
1143 ajListIterDel(&titer);
1144 ajStrDel(&tagnam);
1145 ajStrDel(&tagval);
1146
1147 return;
1148 }
1149