1 /* @source diffseq application
2 **
3 ** Find differences (SNPs) between nearly identical sequences
4 **
5 ** @author Copyright (C) Gary Williams (gwilliam@hgmp.mrc.ac.uk)
6 ** @@
7 **
8 ** This program is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or (at your option) any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
21 ******************************************************************************/
22 
23 #include "emboss.h"
24 
25 
26 
27 
28 /* @datastatic CdsPval ****************************************************
29 **
30 ** CDS information object
31 **
32 ** @alias CdsSval
33 ** @alias CdsOval
34 **
35 ** @attr Start [ajint] Start of CDS (always less than End)
36 ** @attr End [ajint] End of CDS (always greater than Start)
37 ** @attr Phase [ajint] Phase of translation (0,1 or 2)
38 ** @attr Parent [AjBool] ajTrue is this CDS is a parent of a forward sense join
39 **			or the last CDS of a reverse sense join
40 ** @attr Single [AjBool] ajTrue is this CDS is a not member of a join()
41 ** @attr ReverseParent [AjBool] ajTrue is this CDS is a parent of rev sense join
42 **			or the last CDS in a forward sense join
43 ** @attr Local [AjBool] ajTrue is this CDS is local
44 ** @attr Sense [char] Sense '+' or '-'
45 ** @attr Padding [char[3]] Padding to alignment boundary
46 ** @@
47 ******************************************************************************/
48 
49 typedef struct CdsSval
50 {
51     ajint  Start;
52     ajint  End;
53     ajint  Phase;
54     AjBool Parent;
55     AjBool Single;
56     AjBool ReverseParent;
57     AjBool Local;
58     char   Sense;
59     char Padding[3];
60 } CdsOval;
61 #define CdsPval CdsOval*
62 
63 
64 
65 
66 /* @datastatic PosPDiff ****************************************************
67 **
68 ** Position of a difference between two matching regions
69 ** If there is something inserted in one sequence that does not
70 ** occur in the other, then the other has Start after the gap and
71 ** End before the gap and Len=0.
72 **
73 ** @alias PosSDiff
74 ** @alias PosODiff
75 **
76 ** @attr Start1 [ajint] Start of difference in sequence 1 (End1+1 if gap)
77 ** @attr Start2 [ajint] Start of difference in sequence 2 (End1+1 if gap)
78 ** @attr End1 [ajint] End of difference in seq 1
79 ** @attr End2 [ajint] End of difference in seq 2
80 ** @attr Len1 [ajint] Length of difference in sequence 1 (0 = a gap)
81 ** @attr Len2 [ajint] Length of difference in sequence 2 (0 = a gap)
82 ** @attr Rev1 [AjBool] Reverse sequence 1
83 ** @attr Rev2 [AjBool] Reverse sequence 2
84 ** @@
85 ******************************************************************************/
86 
87 typedef struct PosSDiff
88 {
89     ajint  Start1;
90     ajint  Start2;
91     ajint  End1;
92     ajint  End2;
93     ajint  Len1;
94     ajint  Len2;
95     AjBool Rev1;
96     AjBool Rev2;
97 } PosODiff;
98 #define PosPDiff PosODiff*
99 
100 
101 static void diffseq_Diff(const AjPList difflist,
102                          const AjPSeq seq1, const AjPSeq seq2,
103                          AjPReport report, AjPFeattable ftab,
104                          ajint over1start, ajint over1end,
105                          ajint over2start, ajint over2end);
106 
107 static void diffseq_WordMatchListConvDiffToFeat(const AjPList list,
108                                                 AjPFeattable *tab1,
109                                                 AjPFeattable *tab2,
110                                                 const AjPSeq seq1,
111                                                 const AjPSeq seq2);
112 
113 static void diffseq_Features(const char* typefeat, AjPFeature rf,
114                              const AjPFeattable feat,
115                              ajuint start, ajuint end);
116 
117 static void diffseq_AddTags(AjPStr* strval, const AjPFeature feat,
118                             AjBool values);
119 
120 static void diffseq_DiffList(const AjPList matchlist, AjPList difflist,
121                              AjBool global, const AjPSeq seq1,
122                              const AjPSeq seq2, ajint *over1start,
123                              ajint *over1end, ajint *over2start,
124                              ajint *over2end);
125 
126 static PosPDiff diffseq_PosPDiffNew(void);
127 
128 static void diffseq_PosPDiffDel(void **x, void *cl);
129 
130 static void diffseq_FeatSetCDSFrame(AjPFeattable ftab);
131 
132 
133 
134 
135 /* @prog diffseq **************************************************************
136 **
137 ** Find differences (SNPs) between nearly identical sequences
138 **
139 ******************************************************************************/
140 
main(int argc,char ** argv)141 int main(int argc, char **argv)
142 {
143     AjPSeq seq1;
144     AjPSeq seq2;
145     ajint wordlen;
146     AjPTable seq1MatchTable = 0;
147     AjPList matchlist = NULL;
148     AjPList difflist = NULL;
149     AjPReport report;
150     AjPFeattable Tab1 = NULL;
151     AjPFeattable Tab2 = NULL;
152     AjPFeattable TabRpt   = NULL;
153     AjPFeattabOut seq1out = NULL;
154     AjPFeattabOut seq2out = NULL;
155     AjPStr tmpstr=NULL;
156     AjBool global;    /* true if want to report the differences at the ends */
157     ajint over1start;	/* start and end positions of match overlap */
158     ajint over1end;	        /* ... for sequence 1 */
159     ajint over2start = 0;	/* overlap for sequence 2 */
160     ajint over2end;
161 
162 
163     embInit("diffseq", argc, argv);
164 
165     wordlen = ajAcdGetInt("wordsize");
166     seq1    = ajAcdGetSeq("asequence");
167     seq2    = ajAcdGetSeq("bsequence");
168     report  = ajAcdGetReport("outfile");
169     seq1out = ajAcdGetFeatout("aoutfeat");
170     seq2out = ajAcdGetFeatout("boutfeat");
171 
172     /* advanced qualifiers */
173     global        = ajAcdGetBoolean("globaldifferences");
174 
175     ajSeqTrim(seq1);
176     ajSeqTrim(seq2);
177 
178     TabRpt = ajFeattableNewSeq(seq1);
179 
180     embWordLength(wordlen);
181     if(embWordGetTable(&seq1MatchTable, seq1))
182 	/* get table of words */
183 	matchlist = embWordBuildMatchTable(seq1MatchTable, seq2, ajTrue);
184 
185 
186     /* get the minimal set of overlapping matches */
187     if(matchlist)
188 	embWordMatchMin(matchlist);
189 
190 
191     if(matchlist)
192     {
193         /* convert the list of matches to a list of differences */
194         difflist = ajListNew();
195         diffseq_DiffList(matchlist, difflist, global, seq1, seq2,
196                          &over1start, &over1end, &over2start, &over2end);
197 
198 
199 	/* output the gff files */
200 	diffseq_WordMatchListConvDiffToFeat(difflist, &Tab1, &Tab2,
201 						   seq1, seq2);
202 
203 
204 	diffseq_Diff(difflist, seq1, seq2, report, TabRpt,
205 	             over1start, over1end, over2start, over2end);
206 
207 
208 	embWordMatchListDelete(&matchlist); /* free the match structures */
209 
210         /* delete difflist */
211         ajListMap(difflist, &diffseq_PosPDiffDel, NULL);
212         ajListFree(&difflist);
213 
214     }
215     ajReportSetStatistics(report, 2,
216                           ajSeqGetLenTrimmed(seq1)+ajSeqGetLenTrimmed(seq2));
217 
218     ajFeattableWrite(seq1out, Tab1);
219     ajFeattableWrite(seq2out, Tab2);
220 
221     tmpstr = ajStrNew();
222     ajFmtPrintS(&tmpstr, "Feature file for first sequence");
223     ajReportAddFileF(report, ajFeattabOutFile(seq1out), tmpstr);
224 
225     ajFmtPrintS(&tmpstr, "Feature file for second sequence");
226     ajReportAddFileF(report, ajFeattabOutFile(seq2out), tmpstr);
227 
228     ajReportWrite(report, TabRpt, seq1);
229     ajReportClose(report);
230 
231     /* tidy up */
232     ajSeqDel(&seq1);
233     ajSeqDel(&seq2);
234 
235     embWordFreeTable(&seq1MatchTable);
236 
237     ajReportDel(&report);
238 
239     ajListFree(&matchlist);
240     ajListFree(&difflist);
241 
242     ajFeattableDel(&Tab1);
243     ajFeattableDel(&Tab2);
244     ajFeattableDel(&TabRpt);
245     ajFeattabOutDel(&seq1out);
246     ajFeattabOutDel(&seq2out);
247 
248     ajStrDel(&tmpstr);
249 
250 
251     embExit();
252     return 0;
253 }
254 
255 
256 
257 
258 /* @funcstatic diffseq_Diff ************************************************
259 **
260 ** Do a diff and build a report on the diff of the two sequences.
261 **
262 ** @param [r] difflist [const AjPList] List of differences
263 ** @param [r] seq1 [const AjPSeq] Sequence to be diff'd.
264 ** @param [r] seq2 [const AjPSeq] Sequence to be diff'd.
265 ** @param [u] report [AjPReport] Report object.
266 ** @param [u] ftab [AjPFeattable] Report feature table
267 ** @param [r] over1start [ajint] start of overlap region in sequence1
268 ** @param [r] over1end [ajint] end of overlap region in sequence1
269 ** @param [r] over2start [ajint] start of overlap region in sequence2
270 ** @param [r] over2end [ajint] end of overlap region in sequence2
271 ** @return [void]
272 ** @@
273 ******************************************************************************/
274 
diffseq_Diff(const AjPList difflist,const AjPSeq seq1,const AjPSeq seq2,AjPReport report,AjPFeattable ftab,ajint over1start,ajint over1end,ajint over2start,ajint over2end)275 static void diffseq_Diff(const AjPList difflist,
276                          const AjPSeq seq1, const AjPSeq seq2,
277                          AjPReport report, AjPFeattable ftab,
278                          ajint over1start, ajint over1end,
279                          ajint over2start, ajint over2end)
280 {
281     AjIList iter = NULL;	/* match list iterator */
282     PosPDiff diff = NULL;	/* difference structure */
283     const AjPStr s1;		/* string of seq1 */
284     const AjPStr s2;		/* string of seq2 */
285     AjPStr tmp;			/* temporary string */
286 
287     /* stuff for counting SNPs, transitions & transversions */
288     ajint snps = 0;
289     ajint transitions   = 0;
290     ajint transversions = 0;	/* counts of SNP types */
291     char base1 = '\0';
292     char base2 = '\0';
293     AjPStr tmpstr = NULL;
294     static AjPStr tmpseq = NULL;
295 
296     AjPFeattable feat1 = NULL;
297     AjPFeattable feat2 = NULL;
298 
299     AjPFeature gf = NULL;
300     ajuint len1 = 0;
301     ajuint len2 = 0;
302 
303     tmpstr = ajStrNew();
304     tmp    = ajStrNew();
305 
306     len1 = ajSeqGetLen(seq1);
307     len2 = ajSeqGetLen(seq2);
308 
309     s1 = ajSeqGetSeqS(seq1);
310     s2 = ajSeqGetSeqS(seq2);
311 
312     feat1 = ajSeqGetFeatCopy(seq1);
313     feat2 = ajSeqGetFeatCopy(seq2);
314 
315     /*
316     ** Fix the Frame field in the CDS features of the feature table for
317     ** seq1. These are often left as 'unsure' by the ajfeat routines.
318     */
319     diffseq_FeatSetCDSFrame(feat1);
320 
321     /* create report header */
322     ajFmtPrintS(&tmpstr, "Compare: %S     from: %d   to: %d\n\n",
323 		ajReportGetSeqnameSeq(report, seq2),
324 		ajSeqGetBegin(seq2), ajSeqGetEnd(seq2));
325     if (over1start != -1)
326     {
327         ajFmtPrintAppS(&tmpstr, "%S overlap starts at %d\n",
328 			   ajReportGetSeqnameSeq(report, seq1),
329 			   over1start);
330         ajFmtPrintAppS(&tmpstr, "%S overlap starts at %d\n\n",
331 			   ajReportGetSeqnameSeq(report, seq2),
332 			   over2start);
333     }
334     ajReportSetHeaderS(report, tmpstr);
335 
336 
337     iter = ajListIterNewread(difflist);
338     while(!ajListIterDone(iter))
339     {
340 	diff = (PosPDiff) ajListIterGet(iter) ;
341 
342         /* seq 1 details */
343         if(diff->Len1 > 0)
344 	{
345             if(diff->Rev1)
346             {
347                 gf = ajFeatNewIIRev(ftab,
348                                     (len1 - diff->End1 + 1),
349                                     (len1 - diff->Start1 + 1));
350             }
351             else
352             {
353                 gf = ajFeatNewII(ftab, diff->Start1, diff->End1);
354             }
355 
356             ajStrAssignSubS(&tmp, s1, diff->Start1-1, diff->End1-1);
357             base1 = * ajStrGetPtr(tmp);
358 	}
359         else
360         {
361             gf = ajFeatNewBetween(ftab, diff->Start1);
362             ajStrAssignC(&tmp, "");
363         }
364         diffseq_Features("first_feature", gf,
365                             feat1, diff->Start1, diff->End1);
366 
367         /* seq2 details */
368         diffseq_Features("second_feature", gf,
369                             feat2, diff->Start2, diff->End2);
370 
371         ajFmtPrintS(&tmp, "*name %S", ajReportGetSeqnameSeq(report, seq2));
372         ajFeatTagAddSS(gf, NULL, tmp);
373 
374         ajFmtPrintS(&tmp, "*length %d", diff->Len2);
375         ajFeatTagAddSS(gf, NULL, tmp);
376 
377         if(diff->Rev2)
378         {
379             ajFmtPrintS(&tmp, "*strand -");
380             ajFeatTagAddSS(gf, NULL, tmp);
381         }
382 
383         if(diff->Len2 > 0)
384         {
385             if(diff->Rev2)
386             {
387                 ajFmtPrintS(&tmp, "*start %d", (len2 - diff->End2 + 1));
388                 ajFeatTagAddSS(gf, NULL, tmp);
389                 ajFmtPrintS(&tmp, "*end %d", (len2 - diff->Start2 + 1));
390                 ajFeatTagAddSS(gf, NULL, tmp);
391             }
392             else
393             {
394                 ajFmtPrintS(&tmp, "*start %d", diff->Start2);
395                 ajFeatTagAddSS(gf, NULL, tmp);
396                 ajFmtPrintS(&tmp, "*end %d", diff->End2);
397                 ajFeatTagAddSS(gf, NULL, tmp);
398             }
399 
400             ajStrAssignSubS(&tmpseq, s2, diff->Start2-1, diff->End2-1);
401             ajFmtPrintS(&tmp, "*sequence %S", tmpseq);
402             ajFeatTagAddSS(gf, NULL, tmp);
403             base2 = * ajStrGetPtr(tmpseq);
404         }
405         else
406         {
407             if(diff->Rev2)
408             {
409                 ajFmtPrintS(&tmp, "*start %d", (len2 - diff->Start2 + 1));
410                 ajFeatTagAddSS(gf, NULL, tmp);
411                 ajFmtPrintS(&tmp, "*end %d", (len2 - diff->Start2 + 1));
412                 ajFeatTagAddSS(gf, NULL, tmp);
413             }
414             else
415             {
416                 ajFmtPrintS(&tmp, "*start %d", diff->End2);
417                 ajFeatTagAddSS(gf, NULL, tmp);
418                 ajFmtPrintS(&tmp, "*end %d", diff->End2);
419                 ajFeatTagAddSS(gf, NULL, tmp);
420             }
421         }
422 
423         /* count SNPs, transitions & transversions */
424         if(diff->Len1 == 1 && diff->Len2 == 1)
425         {
426             snps++;
427             transitions += (ajint) embPropTransition(base1, base2);
428             transversions += (ajint) embPropTransversion(base1, base2);
429         }
430 
431     }
432 
433     ajListIterDel(&iter);
434 
435     /* create report tail */
436     if(over1start != -1)
437     {
438         ajFmtPrintS(&tmp, "Overlap_end: %d in %S\n",
439                     over1end,
440                     ajReportGetSeqnameSeq(report, seq1));
441         ajFmtPrintAppS(&tmp, "Overlap_end: %d in %S\n",
442                        over2end,
443                        ajReportGetSeqnameSeq(report, seq2));
444         if(ajSeqIsNuc(seq1))
445         {
446             ajFmtPrintAppS(&tmp, "\n");
447             ajFmtPrintAppS(&tmp, "SNP_count: %d\n", snps);
448             ajFmtPrintAppS(&tmp, "Transitions: %d\n", transitions);
449             ajFmtPrintAppS(&tmp, "Transversions: %d\n", transversions);
450         }
451     }
452     else
453         /* no iterations of the match list done - ie no matches */
454         ajFmtPrintS(&tmp, "No regions of alignment found.\n");
455 
456     ajReportSetTailS(report, tmp);
457 
458 
459     ajStrDel(&tmp);
460     ajFeattableDel(&feat1);
461     ajFeattableDel(&feat2);
462 
463     ajStrDel(&tmpstr);
464     ajStrDel(&tmpseq);
465 
466     return;
467 }
468 
469 
470 
471 
472 /* @funcstatic diffseq_WordMatchListConvDiffToFeat ****************************
473 **
474 ** Convert the differences list to feature tables for output.
475 **
476 ** @param [r] list [const AjPList] differences list
477 ** @param [u] tab1 [AjPFeattable*] feature table for sequence 1
478 ** @param [u] tab2 [AjPFeattable*] feature table for sequence 2
479 ** @param [r] seq1 [const AjPSeq] sequence 1
480 ** @param [r] seq2 [const AjPSeq] sequence 2
481 ** @return [void]
482 ** @@
483 ******************************************************************************/
484 
diffseq_WordMatchListConvDiffToFeat(const AjPList list,AjPFeattable * tab1,AjPFeattable * tab2,const AjPSeq seq1,const AjPSeq seq2)485 static void diffseq_WordMatchListConvDiffToFeat(const AjPList list,
486                                                 AjPFeattable *tab1,
487                                                 AjPFeattable *tab2,
488                                                 const AjPSeq seq1,
489                                                 const AjPSeq seq2)
490 {
491     ajint frame = 0;
492     AjPStr source  = NULL;
493     AjPStr type    = NULL;
494     AjPStr note    = NULL;
495     AjPStr replace = NULL;
496     AjPFeature feature = NULL;
497     AjIList iter    = NULL;
498     AjPStr notestr     = NULL;
499     AjPStr replacestr  = NULL;
500     AjPStr sourcestr   = NULL;
501     AjPStr conflictstr = NULL;
502     float score = 0.0;
503     ajuint len1 = 0;
504     ajuint len2 = 0;
505 
506     if(!*tab1)
507         *tab1 = ajFeattableNewSeq(seq1);
508 
509     if(!*tab2)
510         *tab2 = ajFeattableNewSeq(seq2);
511 
512     source     = ajStrNew();
513     type       = ajStrNew();
514     note       = ajStrNew();
515     replace    = ajStrNew();
516     replacestr = ajStrNew();
517     notestr    = ajStrNew();
518 
519     ajStrAssignC(&source,"diffseq");
520     ajStrAssignC(&type,"conflict");
521     ajStrAssignC(&note,"note");
522     ajStrAssignC(&replace,"replace");
523     score = 1.0;
524 
525     len1 = ajSeqGetLen(seq1);
526     len2 = ajSeqGetLen(seq2);
527 
528     iter = ajListIterNewread(list);
529     while(!ajListIterDone(iter))
530     {
531         PosPDiff diff = (PosPDiff) ajListIterGet(iter);
532 
533         if(diff->Len1)
534         {        /* is there a gap between the matches? */
535             if(diff->Rev1)
536                 feature = ajFeatNew(*tab1, source, type,
537                                     len1 - diff->Start1 + 1,
538                                     len1 - diff->End1 + 1,
539                                     score, '-', frame);
540             else
541                 feature = ajFeatNew(*tab1, source, type,
542                                     diff->Start1, diff->End1,
543                                     score, '+', frame);
544 
545             if(diff->Len1 == 1 && diff->Len2 == 1)
546                 ajFmtPrintS(&notestr, "SNP in %S", ajSeqGetNameS(seq2));
547             else if(diff->Len2 == 0)
548                 ajFmtPrintS(&notestr, "Insertion of %d bases in %S",
549                             diff->Len1, ajSeqGetNameS(seq1));
550             else
551                 ajFmtPrintS(&notestr, "%S", ajSeqGetNameS(seq2));
552 
553             ajFeatTagSet(feature, note, notestr);
554 
555             if(diff->Len2 > 0)
556                 ajStrAssignSubS(&replacestr, ajSeqGetSeqS(seq2),
557                                 diff->Start2-1, diff->End2-1);
558             else
559                 ajStrAssignC(&replacestr, "");
560 
561             if(ajFeattableIsProt(*tab1))
562             {
563                 if(ajStrGetLen(replacestr))
564                 {
565                     ajStrAssignSubS(&sourcestr, ajSeqGetSeqS(seq1),
566                                     diff->Start1-1, diff->End1-1);
567                     ajFmtPrintS(&conflictstr, "%S -> %S",
568                                 sourcestr, replacestr);
569                 }
570                 else
571                     ajFmtPrintS(&conflictstr, "MISSING");
572 
573                 ajFeatTagSet(feature, note, conflictstr);
574             }
575             else
576                 ajFeatTagSet(feature, replace, replacestr);
577         }
578 
579         if(diff->Len2)
580         {        /* is there a gap between the matches? */
581             if(diff->Rev2)
582                 feature = ajFeatNew(*tab2, source, type,
583                                     len2 - diff->Start2 + 1,
584                                     len2 - diff->End2 + 1,
585                                     score, '-', frame);
586             else
587                 feature = ajFeatNew(*tab2, source, type,
588                                     diff->Start2, diff->End2,
589                                     score, '+', frame);
590 
591             if(diff->Len2 == 1 && diff->Len1 == 1)
592                 ajFmtPrintS(&notestr, "SNP in %S", ajSeqGetNameS(seq1));
593             else if(diff->Len1 == 0)
594                 ajFmtPrintS(&notestr, "Insertion of %d bases in %S",
595                             diff->Len2, ajSeqGetNameS(seq2));
596             else
597                 ajFmtPrintS(&notestr, "%S", ajSeqGetNameS(seq1));
598 
599             ajFeatTagSet(feature, note, notestr);
600 
601             if(diff->Len1 > 0)
602                 ajStrAssignSubS(&replacestr, ajSeqGetSeqS(seq1),
603                                 diff->Start1-1, diff->End1-1);
604             else
605                 ajStrAssignC(&replacestr, "");
606 
607             if(ajFeattableIsProt(*tab2))
608             {
609                 if(ajStrGetLen(replacestr))
610                 {
611                     ajStrAssignSubS(&sourcestr, ajSeqGetSeqS(seq2),
612                                     diff->Start2-1, diff->End2-1);
613                     ajFmtPrintS(&conflictstr, "%S -> %S",
614                                 sourcestr, replacestr);
615                 }
616                 else
617                     ajFmtPrintS(&conflictstr, "MISSING");
618 
619                 ajFeatTagSet(feature, note, conflictstr);
620             }
621             else
622                 ajFeatTagSet(feature, replace, replacestr);
623 
624         }
625 
626     }
627 
628     ajListIterDel(&iter);
629     ajStrDel(&source);
630     ajStrDel(&type);
631     ajStrDel(&note);
632     ajStrDel(&replace);
633     ajStrDel(&replacestr);
634     ajStrDel(&sourcestr);
635     ajStrDel(&conflictstr);
636     ajStrDel(&notestr);
637 
638     return;
639 }
640 
641 
642 
643 
644 /* @funcstatic diffseq_Features ********************************************
645 **
646 ** Write out any features which overlap this region.
647 ** Don't write out the source feature - far too irritating!
648 **
649 ** @param [r] typefeat [const char*] Report feature tag type
650 ** @param [u] rf [AjPFeature] Report feature to store results in
651 ** @param [r] feat [const AjPFeattable] Feature table to search
652 ** @param [r] start [ajuint] Start position of region (in human coordinates)
653 ** @param [r] end [ajuint] End position of region (in human coordinates)
654 ** @return [void]
655 ** @@
656 ******************************************************************************/
657 
diffseq_Features(const char * typefeat,AjPFeature rf,const AjPFeattable feat,ajuint start,ajuint end)658 static void diffseq_Features(const char* typefeat, AjPFeature rf,
659                                 const AjPFeattable feat,
660                                 ajuint start, ajuint end)
661 {
662     AjIList iter  = NULL;
663     AjPFeature gf = NULL;
664     AjPStr tmp    = NULL;
665 
666     if(!feat)
667         return;
668 
669     if(feat->Features)
670     {
671         iter = ajListIterNewread(feat->Features);
672         while(!ajListIterDone(iter))
673         {
674             gf = ajListIterGet(iter);
675 
676 
677             /* check that the feature is within the range for display */
678             if(start > ajFeatGetEnd(gf) || end < ajFeatGetStart(gf))
679                 continue;
680 
681             /* don't output the 'source' feature */
682             if(!ajStrCmpC(ajFeatGetType(gf), "source"))
683                 continue;
684 
685             /* write out the feature details */
686             ajFmtPrintS(&tmp, "*%s %S %d-%d",
687                                typefeat, ajFeatGetType(gf),
688                                ajFeatGetStart(gf), ajFeatGetEnd(gf));
689             diffseq_AddTags(&tmp, gf, ajTrue);
690             ajFeatTagAddSS(rf, NULL,  tmp);
691 
692         }
693         ajListIterDel(&iter) ;
694     }
695 
696 
697     ajStrDel(&tmp);
698 
699     return;
700 }
701 
702 
703 
704 
705 /* @funcstatic diffseq_AddTags *********************************************
706 **
707 ** Appends feature tag values to a string in a simple format.
708 ** Don't write out the translation - is it often far too long!
709 **
710 ** @param [u] strval [AjPStr*] String
711 ** @param [r] feat [const AjPFeature] Feature to be processed
712 ** @param [r] values [AjBool] display values of tags
713 **
714 ** @return [void]
715 ** @@
716 ******************************************************************************/
717 
diffseq_AddTags(AjPStr * strval,const AjPFeature feat,AjBool values)718 static void diffseq_AddTags(AjPStr* strval,
719 			    const AjPFeature feat, AjBool values)
720 {
721     AjIList titer;                        /* iterator for taglist */
722     AjPStr tagnam = NULL;
723     AjPStr tagval = NULL;
724 
725     /* iterate through the tags and test for match to patterns */
726 
727     titer = ajFeatTagIter(feat);
728     while(ajFeatTagval(titer, &tagnam, &tagval))
729         /* don't display the translation tag - it is far too long :-) */
730         if(!ajStrMatchC(tagnam, "translation"))
731         {
732             if(values == ajTrue)
733                 ajFmtPrintAppS(strval, " %S='%S'", tagnam, tagval);
734             else
735                 ajFmtPrintAppS(strval, " %S", tagnam);
736         }
737 
738     ajListIterDel(&titer);
739     ajStrDel(&tagnam);
740     ajStrDel(&tagval);
741 
742     return;
743 }
744 
745 
746 
747 
748 /* @funcstatic diffseq_DiffList ************************************************
749 **
750 ** Converts a list of matching regions into a list of differences
751 ** The positions are held in human coordinates (starting at 1)
752 ** rather than computer coordinates (starting at 0) because
753 ** there will be a lot of comparing/writing to feature tables
754 ** which use human coordinates.
755 ** over1start is returned with value '-1' if no matching regions were found.
756 **
757 ** @param [r] matchlist [const AjPList] List of minimal non-overlapping matches
758 ** @param [u] difflist [AjPList] Resulting list of differences
759 ** @param [r] global [AjBool] ajTrue if we want the differences at the ends
760 ** @param [r] seq1 [const AjPSeq] sequence 1
761 ** @param [r] seq2 [const AjPSeq] sequence 2
762 ** @param [w] over1start [ajint *] start of overlap region in sequence1
763 ** @param [w] over1end [ajint *] end of overlap region in sequence1
764 ** @param [w] over2start [ajint *] start of overlap region in sequence2
765 ** @param [w] over2end [ajint *] end of overlap region in sequence2
766 ** @return [void]
767 ** @@
768 ******************************************************************************/
769 
diffseq_DiffList(const AjPList matchlist,AjPList difflist,AjBool global,const AjPSeq seq1,const AjPSeq seq2,ajint * over1start,ajint * over1end,ajint * over2start,ajint * over2end)770 static void diffseq_DiffList(const AjPList matchlist, AjPList difflist,
771                              AjBool global,
772                              const AjPSeq seq1, const AjPSeq seq2,
773                              ajint *over1start, ajint *over1end,
774                              ajint *over2start, ajint *over2end)
775 {
776 
777     AjIList iter    = NULL;
778     ajint misstart1 = -1;                /* start of mismatch region in seq1 */
779     ajint misstart2 = -1;                /* start of mismatch region in seq2 */
780     ajint misend1;
781     ajint misend2;                        /* end of mismatch region */
782     PosPDiff diff = NULL;                /* Difference object */
783     ajint i;
784     ajint j;
785     const char *seqc1;
786     const char *seqc2;
787     AjBool rev1 = ajFalse;
788     AjBool rev2 = ajFalse;
789 
790     seqc1 = ajSeqGetSeqC(seq1);
791     seqc2 = ajSeqGetSeqC(seq2);
792 
793     rev1 = ajSeqIsReversed(seq1);
794     rev2 = ajSeqIsReversed(seq2);
795 
796     *over1start = -1;                        /* flag for no matches found */
797 
798     iter = ajListIterNewread(matchlist);
799     while(!ajListIterDone(iter))
800     {
801         EmbPWordMatch p =(EmbPWordMatch) ajListIterGet(iter) ;
802 
803         ajDebug("diffseq match len: %d seq1 %u seq2 %u\n",
804                 p->length, p->seq1start, p->seq2start);
805 
806         misend1 = p->seq1start;
807         misend2 = p->seq2start;
808 
809         if (misstart1 == -1)        /* this is the first iteration */
810         {
811             /* note the end of the overlap (start of first match) */
812             *over1start = p->seq1start + 1;
813             *over2start = p->seq2start + 1;
814 
815             /* add difference at the start, if required */
816             if(global &&                /* we want the global differences */
817               (p->seq1start != 0 ||
818                p->seq2start != 0))        /* no match at pos 1,1 */
819             {
820                 diff = diffseq_PosPDiffNew();
821                 /*
822                 ** look for matches of less than the size of a word that
823                 ** will have been missed by the word-match routines, but which
824                 ** look weird if they are described as differences in
825                 ** the output.
826                 */
827                 diff->Start1 = 1;
828                 diff->Start2 = 1;
829                 diff->End1 = misend1;
830                 diff->End2 = misend2;
831                 diff->Len1 = diff->End1 - diff->Start1 + 1;
832                 diff->Len2 = diff->End2 - diff->Start2 + 1;
833                 diff->Rev1 = rev1;
834                 diff->Rev2 = rev2;
835 
836                 /*
837                 ** If there are mismatches on both sequences, see if we can
838                 ** make a small match at the starts to tidy things up a bit.
839                 ** In other words, we increase the start of the mismatch if
840                 ** there are any matching bases there.
841                 */
842                 for (i=0;
843                      diff->Len1 && diff->Len2 &&
844                      tolower((ajint)seqc1[i]) == tolower((ajint)seqc2[i]); i++)
845                 {
846                     diff->Len1--;
847                     diff->Len2--;
848                     diff->Start1++;
849                     diff->Start2++;
850                 }
851 
852                 /* add node to the end of the list */
853                 ajListPushAppend(difflist, diff);
854             }
855         }
856         else    /* this is a mismatch between two matches */
857         {
858             diff = diffseq_PosPDiffNew();
859             diff->Start1 = misstart1;
860             diff->Start2 = misstart2;
861             diff->End1 = misend1;
862             diff->End2 = misend2;
863             diff->Len1 = diff->End1 - diff->Start1 + 1;
864             diff->Len2 = diff->End2 - diff->Start2 + 1;
865             diff->Rev1 = rev1;
866             diff->Rev2 = rev2;
867 
868             ajDebug("diffseq diff seq1 %d..%d  (%d) seq2: %d..%d  (%d)\n",
869                     diff->Start1, diff->End1, diff->Len1,
870                     diff->Start2, diff->End2, diff->Len2);
871 
872             /* add node to the end of the list */
873             ajListPushAppend(difflist, diff);
874         }
875 
876         /* note the start position of the next mismatch */
877         misstart1 = p->seq1start + p->length + 1;
878         misstart2 = p->seq2start + p->length + 1;
879     }
880 
881     /* note the end of the overlap (end of last match) */
882     *over1end = misstart1-1;
883     *over2end = misstart2-1;
884 
885     /* add difference at the end, if required */
886     if(global &&                        /* we want the global differences */
887       (misstart1 <= (ajint) ajSeqGetLen(seq1) ||   /* no match at the end */
888        misstart2 <= (ajint) ajSeqGetLen(seq2)))
889     {
890         diff = diffseq_PosPDiffNew();
891         diff->Start1 = misstart1;
892         diff->Start2 = misstart2;
893         diff->End1 = ajSeqGetLen(seq1);
894         diff->End2 = ajSeqGetLen(seq2);
895         diff->Len1 = diff->End1 - diff->Start1 + 1;
896         diff->Len2 = diff->End2 - diff->Start2 + 1;
897         diff->Rev1 = rev1;
898         diff->Rev2 = rev2;
899 
900         /*
901         ** If there are mismatches on both sequences, see if we can
902         ** make a small match at the ends to tidy things up a bit.
903         ** In other words, we reduce the end of the mismatch if there
904         ** are any matching bases there.
905         */
906         for (i=ajSeqGetLen(seq1), j=ajSeqGetLen(seq2);
907              diff->Len1 && diff->Len2 &&
908              tolower((ajint)seqc1[i-1]) == tolower((ajint)seqc2[j-1]);
909 	     i--, j--)
910         {
911             diff->Len1--;
912             diff->Len2--;
913             diff->End1--;
914             diff->End2--;
915         }
916         /* add node to the end of the list */
917         ajListPushAppend(difflist, diff);
918     }
919 
920     ajListIterDel(&iter);
921 
922     return;
923 }
924 
925 
926 
927 
928 /* @funcstatic diffseq_PosPDiffNew ********************************************
929 **
930 ** Constructor for an empty PosPDiff object
931 **
932 ** @return [PosPDiff] Diference object
933 ** @category new [PosPDiff] Constructor
934 ** @@
935 ******************************************************************************/
936 
diffseq_PosPDiffNew(void)937 static PosPDiff diffseq_PosPDiffNew(void)
938 {
939     PosPDiff pthis;
940     AJNEW0(pthis);
941 
942     return pthis;
943 }
944 
945 
946 
947 
948 /* @funcstatic diffseq_PosPDiffDel ********************************************
949 **
950 ** Destructor for a PosPDiff object for use with ajListMap
951 **
952 ** @param [r] x [void**] Undocumented
953 ** @param [r] cl [void*] Undocumented
954 ** @return [void]
955 ** @@
956 ******************************************************************************/
957 
diffseq_PosPDiffDel(void ** x,void * cl)958 static void diffseq_PosPDiffDel(void **x, void *cl)
959 {
960     PosPDiff thys;
961     thys = (PosPDiff)*x;
962 
963     (void) cl;				/* make it used */
964 
965     AJFREE(thys);
966 
967     return;
968 }
969 
970 
971 
972 
973 /* @funcstatic diffseq_FeatSetCDSFrame ****************************************
974 **
975 ** The Feature object in the ajfeat library has a field 'Frame' but it is
976 ** often left as '0' (unknown) in CDS features. This routine fixes it.
977 ** This routine assumes that the CDS features within a join()
978 ** are sorted by start position.
979 **
980 ** Frame value of 0 is unknown, values 1,2,3 are equal to GFF phases 0,1,2
981 **
982 ** @param [u] ftab [AjPFeattable] Feature table
983 ** @return [void]
984 ** @@
985 ******************************************************************************/
986 
diffseq_FeatSetCDSFrame(AjPFeattable ftab)987 static void diffseq_FeatSetCDSFrame(AjPFeattable ftab)
988 {
989 
990     AjIList iter      = NULL;        /* feature table iterator */
991     AjIList titer     = NULL;        /* feature tags iterator */
992     AjPFeature gf     = NULL;
993     ajint phase = 0;                /* translation phase, 0,1 or 2 */
994     ajint prevstart = 0;        /* start and end of previous CDS in table */
995     ajint prevend = 0;
996     AjBool prevparent;                /* flag true if prev CDS was a parent */
997     AjBool unsure;                /* true if we are unsure of the phase */
998     AjPStr tagnam = NULL;/* name and value of tags of the feature */
999     AjPStr tagval = NULL;
1000 
1001     unsure = ajFalse;
1002 
1003     if(!ftab)
1004         return;
1005 
1006     if(!ftab->Features)
1007         return;
1008 
1009     iter = ajListIterNewread(ftab->Features);
1010     while(!ajListIterDone(iter))
1011     {
1012         gf = ajListIterGet(iter);
1013 
1014         /* is this a CDS feature? */
1015         if (ajStrCmpC(ajFeatGetType(gf), "CDS"))
1016             continue;
1017 
1018         /* is this a forward sense CDS? */
1019         if(gf->Strand == '+')
1020         {
1021 
1022             /* is this the start of a new group of CDS in a multi-exon gene? */
1023             /* reset the phase to zero, or continue it accordingly */
1024             phase = 0;
1025             unsure = ajFalse;
1026 
1027             /* are we unsure about the start of this CDS? */
1028             if ((gf->Flags & AJFEATFLAG_START_BEFORE_SEQ) ||
1029                 (gf->Flags & AJFEATFLAG_START_UNSURE) ||
1030                 (gf->Flags & AJFEATFLAG_START_TWO))
1031                 unsure = ajTrue;
1032 
1033             /* check to see if the /codon_start tag is set on this feature */
1034 
1035             /* check to see if the /codon_start tag is set on this feature */
1036             titer = ajFeatTagIter(gf);
1037             while(ajFeatTagval(titer, &tagnam, &tagval))
1038                 if(!ajStrCmpC(tagnam, "codon_start"))
1039                 {
1040                     ajStrToInt(tagval, &phase);
1041                     phase--;
1042                     unsure = ajFalse;
1043                 }
1044             ajListIterDel(&titer);
1045 
1046 
1047             /* If we are sure, set the Frame field in the feature
1048             ** object.
1049             */
1050             if (unsure)
1051                 gf->Frame = 0;
1052             else
1053                 if (gf->Frame == 0)
1054                     gf->Frame = phase+1;
1055         }
1056 
1057         /* are we unsure about the end of this CDS? */
1058         if ((gf->Flags & AJFEATFLAG_END_AFTER_SEQ) ||
1059             (gf->Flags & AJFEATFLAG_END_UNSURE) ||
1060             (gf->Flags & AJFEATFLAG_END_TWO))
1061             unsure = ajTrue;
1062 
1063         /* remember the start/end of this feature */
1064         prevstart = ajFeatGetStart(gf);
1065         prevend = ajFeatGetEnd(gf);
1066     }
1067 
1068     ajListIterDel(&iter);
1069 
1070     /*
1071     ** Go back up through list filling in the frame of the reverse sense
1072     ** CDS features.
1073     */
1074     unsure = ajFalse;
1075     prevparent = ajTrue;
1076     iter = ajListIterNewBack(ftab->Features);
1077     while(!ajListIterDoneBack(iter))
1078     {
1079         gf = ajListIterGetBack(iter);
1080 
1081         /* is this a CDS feature? */
1082         if (ajStrCmpC(ajFeatGetType(gf), "CDS"))
1083             continue;
1084 
1085         /* is this a reverse sense CDS? */
1086         if(gf->Strand == '-')
1087         {
1088             /* is this a new CDS group? - set the phase */
1089             if(prevparent)
1090             {
1091                 phase = 0;
1092                 unsure = ajFalse;
1093             }
1094             else
1095             {
1096                 phase += (prevend + 1 - prevstart) % 3;
1097                 phase %= 3;
1098             }
1099 
1100             /* are we unsure about the end of this CDS? */
1101             if ((gf->Flags & AJFEATFLAG_END_AFTER_SEQ) ||
1102                 (gf->Flags & AJFEATFLAG_END_UNSURE) ||
1103                 (gf->Flags & AJFEATFLAG_END_TWO))
1104                 unsure = ajTrue;
1105 
1106             /* check to see if the /codon_start tag is set on this feature */
1107             titer = ajFeatTagIter(gf);
1108             while(ajFeatTagval(titer, &tagnam, &tagval))
1109                 if(!ajStrCmpC(tagnam, "codon_start"))
1110                 {
1111                     ajStrToInt(tagval, &phase);
1112                     phase--;
1113                     unsure = ajFalse;
1114                 }
1115             ajListIterDel(&titer);
1116 
1117             /*
1118             ** If we are sure, set the Frame field in the feature
1119             ** object.
1120             */
1121             if (unsure)
1122                 gf->Frame = 0;
1123             else
1124                 if (gf->Frame == 0)
1125                     gf->Frame = phase+1;
1126         }
1127 
1128         /* remember the parental status of this CDS */
1129         prevparent = ajTrue;
1130 
1131         /* are we unsure about the start of this CDS? */
1132         if ((gf->Flags & AJFEATFLAG_START_BEFORE_SEQ) ||
1133             (gf->Flags & AJFEATFLAG_START_UNSURE) ||
1134             (gf->Flags & AJFEATFLAG_START_TWO))
1135             unsure = ajTrue;
1136 
1137         /* remember the start/end of this feature */
1138         prevstart = ajFeatGetStart(gf);
1139         prevend = ajFeatGetEnd(gf);
1140     }
1141 
1142     ajListIterDel(&iter);
1143     ajListIterDel(&titer);
1144     ajStrDel(&tagnam);
1145     ajStrDel(&tagval);
1146 
1147     return;
1148 }
1149