1 /* @source sirna application
2 **
3 ** Finds siRNA duplexes in mRNA
4 ** @author Copyright (C) Gary Williams (gwilliam@hgmp.mrc.ac.uk)
5 ** @@
6 **
7 ** This program is free software; you can redistribute it and/or
8 ** modify it under the terms of the GNU General Public License
9 ** as published by the Free Software Foundation; either version 2
10 ** of the License, or (at your option) any later version.
11 **
12 ** This program is distributed in the hope that it will be useful,
13 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 ** GNU General Public License for more details.
16 **
17 ** You should have received a copy of the GNU General Public License
18 ** along with this program; if not, write to the Free Software
19 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
20 ******************************************************************************/
21 
22 
23 #include "emboss.h"
24 #include <math.h>
25 #include <stdlib.h>
26 
27 #define NOT_WANTED -100
28 
29 
30 
31 
32 static void sirna_report(AjPReport report, const AjPSeq seq,
33 			 AjBool poliii, AjBool aa, AjBool tt, AjBool polybase,
34 			 AjBool context, AjPSeqout seqout);
35 static ajint sirna_begin(const AjPSeq seq, AjPReport report, AjBool poliii,
36 			  AjBool aa, AjBool tt, AjBool polybase,
37 			  AjBool context);
38 static void sirna_new_value(AjPList list, ajint pos, ajint score,
39 			    ajint GCcount);
40 static int sirna_compare_score(const void* v1, const void* v2);
41 static void sirna_output(const AjPList list,
42 			 AjPFeattable TabRpt, const AjPSeq seq,
43 			 AjBool context, AjPSeqout seqout);
44 
45 
46 
47 
48 /* @datastatic PValue *********************************************************
49 **
50 ** structure for position, score and GC count at the start of a window
51 **
52 ** @alias SValue
53 ** @alias OValue
54 **
55 ** @attr pos [ajint] Position
56 ** @attr GCcount [ajint] GC count
57 ** @attr score [ajint] Score
58 ******************************************************************************/
59 
60 typedef struct SValue
61 {
62     ajint pos;
63     ajint GCcount;
64     ajint score;
65 } OValue;
66 #define PValue OValue*
67 
68 
69 
70 
71 /* @prog sirna ****************************************************************
72 **
73 ** Finds siRNA duplexes in mRNA
74 **
75 ******************************************************************************/
76 
main(int argc,char ** argv)77 int main(int argc, char **argv)
78 {
79     AjPSeqall seqall;
80     AjPReport report = NULL;
81     AjBool poliii;
82     AjBool aa;
83     AjBool tt;
84     AjBool polybase;
85     AjBool context;
86     AjPSeqout seqout;
87 
88     AjPSeq seq = NULL;
89 
90 
91     embInit("sirna",argc,argv);
92 
93     seqall    = ajAcdGetSeqall("sequence");
94     poliii    = ajAcdGetBoolean("poliii");
95     aa        = ajAcdGetBoolean("aa");
96     tt        = ajAcdGetBoolean("tt");
97     polybase  = ajAcdGetBoolean("polybase");
98     context   = ajAcdGetBoolean("context");
99     report    = ajAcdGetReport("outfile");
100     seqout    = ajAcdGetSeqoutall("outseq");
101 
102     while(ajSeqallNext(seqall, &seq))
103 	sirna_report(report, seq, poliii, aa, tt, polybase, context, seqout);
104 
105     ajReportSetSeqstats(report, seqall);
106 
107     ajSeqallDel(&seqall);
108     ajSeqDel(&seq);
109     ajReportClose(report);
110     ajSeqoutDel(&seqout);
111     ajReportDel(&report);
112 
113     embExit();
114 
115     return 0;
116 }
117 
118 
119 
120 
121 /* @funcstatic sirna_report ***************************************************
122 **
123 ** finds regions that can be used as siRNAs
124 **
125 ** @param [u] report [AjPReport] Undocumented
126 ** @param [r] seq [const AjPSeq] Sequence
127 ** @param [r] poliii [AjBool] True if want poliii expressable probes
128 ** @param [r] aa [AjBool] True if want AA at start of region
129 ** @param [r] tt [AjBool] True if want TT at end of region
130 ** @param [r] polybase [AjBool] True if we allow 4-mers of any base
131 ** @param [r] context [AjBool] True if we wish to show the 2 bases before
132 **                              the probe region
133 ** @param [u] seqout [AjPSeqout] Ouput sequence object
134 ** @return [void]
135 ** @@
136 ******************************************************************************/
137 
138 
sirna_report(AjPReport report,const AjPSeq seq,AjBool poliii,AjBool aa,AjBool tt,AjBool polybase,AjBool context,AjPSeqout seqout)139 static void sirna_report(AjPReport report, const AjPSeq seq, AjBool poliii,
140 			 AjBool aa, AjBool tt, AjBool polybase,
141 			 AjBool context, AjPSeqout seqout)
142 {
143     AjPFeattable TabRpt=NULL;	 /* output feature table for report */
144     ajint position;
145     ajint i;
146     ajint j;
147     ajint window_len = 23; /* the length of the required siRNA region */
148     ajint shift = 1;	       /* want to evaluate at each position */
149     ajint count_gc;	     /* number of G + C bases in the window */
150     char c;
151     const char *cseq;
152     ajint aaaa_count;
153     ajint cccc_count;
154     ajint gggg_count;
155     ajint tttt_count;
156     AjPList scorelist;
157     ajint score;
158     AjPStr newstr = NULL;
159 
160     /* scores */
161     ajint CDS_50  = NOT_WANTED;
162     ajint CDS_100 = -2;
163     ajint A_start = +3;
164     ajint TT_end  = +1;
165     ajint GGGG    = NOT_WANTED;
166 
167     ajint GC_5  = NOT_WANTED;
168     ajint GC_6  = 0;
169     ajint GC_7  = +2;
170     ajint GC_8  = +4;
171     ajint GC_9  = +5;
172     ajint GC_10 = +6;
173     ajint GC_11 = +5;
174     ajint GC_12 = +4;
175     ajint GC_13 = +2;
176     ajint GC_14 = +0;
177     ajint GC_15 = NOT_WANTED;
178 
179     /* heavy penalty for position 2 not 'A' */
180     ajint POS_2_NOT_A = NOT_WANTED;
181 
182     /*
183     ** if want Pol III expressable probes, then have a heavy
184     ** penalty for not NAR(N17)YNN
185     */
186     ajint NOT_POLIII = NOT_WANTED;
187 
188     ajint e;
189     ajint CDS_begin;
190 
191     /* initalise report feature table */
192     TabRpt = ajFeattableNewSeq(seq);
193 
194     /* get start of CDS region */
195     CDS_begin = sirna_begin(seq, report, poliii, aa, tt, polybase, context);
196 
197     cseq = ajSeqGetSeqC(seq);
198 
199     /*
200     ** Using a list holding info on position, %GC, score. Sort by
201     ** score and write the top N over the threshold to the report file
202     */
203     scorelist = ajListNew();
204 
205     /*
206     ** initialise the GC count for the first window
207     ** we count the %GC in the 20 bases from position 2 to 21 of the 23
208     */
209     count_gc = 0;
210     for (i=CDS_begin+1; i < CDS_begin+window_len-2; i++)
211     {
212 	c = cseq[i];
213 	if(tolower((int)c) == 'g' || tolower((int)c) == 'c')
214 	    count_gc++;
215     }
216 
217     e = (ajSeqGetEnd(seq) - window_len); /* last position a window can start */
218     for(position=CDS_begin; position < e; position+=shift)
219     {
220 	/* initialise score */
221 	score = 0;
222 
223 	/* penalty for being too close to the CDS start */
224 	if(CDS_begin != 0)
225 	{
226 	    /* check for a CDS start site */
227 	    if(position < CDS_begin + 50)
228 		score += CDS_50;
229 	    else if(position < CDS_begin + 100)
230 		score += CDS_100;
231 	}
232 
233 	/* set score for GC bases */
234 	if(count_gc <= 5)
235 	    score += GC_5;
236 	else if(count_gc == 6)
237 	    score += GC_6;
238 	else if(count_gc == 7)
239 	    score += GC_7;
240 	else if(count_gc == 8)
241 	    score += GC_8;
242 	else if(count_gc == 9)
243 	    score += GC_9;
244 	else if(count_gc == 10)
245 	    score += GC_10;
246 	else if(count_gc == 11)
247 	    score += GC_11;
248 	else if(count_gc == 12)
249 	    score += GC_12;
250 	else if(count_gc == 13)
251 	    score += GC_13;
252 	else if(count_gc == 14)
253 	    score += GC_14;
254 	else if(count_gc >= 15)
255 	    score += GC_15;
256 
257 
258 	ajDebug("%d) GC score = %d\n", position, score);
259 
260 	/* check for AA at start */
261 	if(tolower((int)cseq[position]) == 'a')
262 	{
263 	    score += A_start;
264 	    ajDebug("Found A at start\n");
265 	}
266 	else if(aa)
267 	    score += NOT_WANTED;
268 
269 	/* check for TT at end */
270 	if(((tolower((int)cseq[position+window_len-1]) == 't') &&
271 	    (tolower((int)cseq[position+window_len-2]) == 't')) ||
272 	   ((tolower((int)cseq[position+window_len-1]) == 'u') &&
273 	    (tolower((int)cseq[position+window_len-2]) == 'u')))
274 	{
275 	    score += TT_end;
276 	    ajDebug("Found TT at end\n");
277 	}
278 	else if(tt)
279 	    score += NOT_WANTED;
280 
281 
282 	/* check for GGGG and other 4-mers */
283 	aaaa_count = 0;
284 	cccc_count = 0;
285 	gggg_count = 0;
286 	tttt_count = 0;
287 	for(j=position; j < position+window_len; j++)
288 	{
289 	    c = cseq[j];
290 	    if(tolower((int)c) != 'a')
291 		aaaa_count = 0;
292 	    else
293 		aaaa_count++;
294 
295 	    if(tolower((int)c) != 'c')
296 		cccc_count = 0;
297 	    else
298 		cccc_count++;
299 
300 	    if(tolower((int)c) != 'g')
301 		gggg_count = 0;
302 	    else
303 		gggg_count++;
304 
305 	    if(tolower((int)c) != 't' &&
306 	       tolower((int)c) != 'u')
307 		tttt_count = 0;
308 	    else
309 		tttt_count++;
310 
311 	    /* always bomb out if we find GGGG */
312 	    if(gggg_count >= 4)
313 	    {
314 		score += GGGG;
315 		ajDebug("Found a GGGG\n");
316 		break;
317 	    }
318 
319 	    /* bomb out if we don't want any 4-mer and we find one */
320 	    if(!polybase)
321 	    {
322 		if(aaaa_count >= 4)
323 		{
324 		    score += NOT_WANTED;
325 		    break;
326 		}
327 
328 		if(cccc_count >= 4)
329 		{
330 		    score += NOT_WANTED;
331 		    break;
332 		}
333 
334 		if(tttt_count >= 4) {
335 		    score += NOT_WANTED;
336 		    break;
337 		}
338 	    }
339 	}
340 
341 	/* check to see if the second base is an 'A' */
342 	if(tolower((int)cseq[position+1]) != 'a')
343 	    score += POS_2_NOT_A;
344 
345 	/*
346 	** if want probes that can be expressed from Pol III
347 	** expression vectors
348 	** add a penalty if they do not have the correct purine/pyrimidine
349 	** pattern
350 	*/
351 	if(poliii &&
352 	   ((tolower((int)cseq[position+2]) != 'a' &&
353 	     tolower((int)cseq[position+2]) != 'g') ||
354 	    (tolower((int)cseq[position+window_len-3]) != 'c' &&
355 	     tolower((int)cseq[position+window_len-3]) != 'u' &&
356 	     tolower((int)cseq[position+window_len-3]) != 't')))
357 	{
358 	    score += NOT_POLIII;
359 	    ajDebug("Not a PolIII region\n");
360 	}
361 
362 	/* save the score and other details */
363 	if(score > 0)
364 	    sirna_new_value(scorelist, position, score, count_gc);
365 
366 	/*
367 	** update count of GC bases
368 	** dropping this base off the 5' end of the window
369 	*/
370 	c = cseq[position+1];
371 	ajDebug("%d) score = %d\n", position, score);
372 	ajDebug("** base = %c\n", c);
373 
374 	if(tolower((int)c) == 'g' || tolower((int)c) == 'c')
375 	    count_gc--;
376 
377 	/* new base shifted onto 3' end of window -2 */
378 	c = cseq[position+window_len-2];
379 	if(tolower((int)c) == 'g' || tolower((int)c) == 'c')
380 	    count_gc++;
381 
382 	ajDebug("Next GC count=%d\n", count_gc);
383     }
384 
385     /* now sort the list by score then position */
386     ajListSort(scorelist, sirna_compare_score);
387 
388 
389 
390     /* now pop off the positive scores and write them to the report object */
391     sirna_output(scorelist, TabRpt, seq, context, seqout);
392     ajReportWrite(report, TabRpt, seq);
393 
394     ajSeqoutClose(seqout);
395     ajListFreeData(&scorelist);
396     ajFeattableDel(&TabRpt);
397     ajStrDel(&newstr);
398 
399     return;
400 }
401 
402 
403 
404 
405 /* @funcstatic sirna_begin ***************************************************
406 **
407 ** Finds the start position of the CDS  or uses -sbegin if no feature table
408 ** Adds some explanation to the report header.
409 **
410 ** @param [r] seq [const AjPSeq] Sequence
411 ** @param [u] report [AjPReport] Report object
412 ** @param [r] poliii [AjBool] True if want poliii expressable probes
413 ** @param [r] aa [AjBool] True if want AA at start of region
414 ** @param [r] tt [AjBool] True if want TT at end of region
415 ** @param [r] polybase [AjBool] True if we allow 4-mers of any base
416 ** @param [r] context [AjBool] True if we wish to show the 2 bases
417 **                           before the probe region
418 ** @return [ajint] start of CDS region (using positions starting from 0)
419 ** @@
420 ******************************************************************************/
421 
sirna_begin(const AjPSeq seq,AjPReport report,AjBool poliii,AjBool aa,AjBool tt,AjBool polybase,AjBool context)422 static ajint sirna_begin(const AjPSeq seq, AjPReport report, AjBool poliii,
423 			 AjBool aa, AjBool tt, AjBool polybase,
424 			 AjBool context)
425 {
426     AjPFeattable featab = NULL;	    /* input sequence feature table */
427     ajint begin = 0;
428     AjPFeature gf = NULL;
429     AjIList iter  = NULL;
430     AjPStr type   = NULL;
431     AjPStr head   = NULL;
432     AjPStr head2  = NULL;
433 
434     /* get input sequence features */
435     featab = ajSeqGetFeatCopy(seq);
436     type   = ajStrNew();
437     ajStrAssignC(&type, "CDS");
438     head  = ajStrNew();
439     head2 = ajStrNew();
440 
441     ajDebug("sirna_begin()\n");
442 
443     /* say something about the options in the report header */
444     if(poliii)
445 	ajStrAppendC(&head, "Selecting only regions suitable for "
446 		  "PolIII expression vectors\n");
447 
448     if(aa)
449 	ajStrAppendC(&head, "Selecting only siRNA regions starting with 'AA'\n");
450 
451     if(tt)
452 	ajStrAppendC(&head, "Selecting only siRNA regions ending with 'TT'\n");
453 
454     if(!polybase)
455 	ajStrAppendC(&head, "Selecting only siRNA regions with no 4-mers "
456 		  "of any base\n");
457 
458     if(context)
459 	ajStrAppendC(&head, "The forward sense sequence shows the first 2 "
460 		  "bases of\nthe 23 base region in brackets, this "
461 		  "should be ignored\nwhen ordering siRNA probes.\n");
462 
463     /* are there any features - find the first CDS feature */
464     if(ajFeattableGetSize(featab))
465     {
466 	iter = ajListIterNewread(featab->Features);
467 	while(!ajListIterDone(iter))
468 	{
469 	    gf = ajListIterGet(iter);
470 	    if(ajStrMatchS(type, ajFeatGetType(gf)))
471 	    {
472 		/* is the feature 'CDS'? */
473 		begin = ajFeatGetStart(gf)-1;
474 		ajFmtPrintS(&head2, "CDS region found in feature table "
475 			    "starting at %d",
476 			    begin+1);
477 		/* ajDebug("Found a CDS at %d\n", begin);*/
478 		break;
479 	    }
480 	}
481 	ajListIterDel(&iter);
482     }
483 
484     /* if didn't find a CDS, assume -sbegin is the CDS */
485     if(begin == 0)
486     {
487 	begin = ajSeqGetBegin(seq)-1;
488 	if(begin == 0)
489 	{
490 	    ajDebug("begin = 0\n");
491 	    ajFmtPrintS(&head2, "%s%s%s%s",
492 			"No CDS region was found in the feature table.\n",
493 			"No CDS region was indicated by setting -sbegin.\n",
494 			"There will therefore be no penalty for siRNAs found ",
495 			"in the first 100 bases.");
496 	}
497 	else
498 	{
499 	    ajDebug("begin != 0\n");
500 	    ajFmtPrintS(&head2, "%s%s%s%d",
501 			"No CDS region was found in the feature table.\n",
502 			"The CDS region is assumed to start at the position ",
503 			"given by -sbegin: ",
504 			begin+1);
505 	}
506 
507 	ajDebug("CDS specified at %d\n", begin);
508     }
509 
510     ajStrAppendS(&head, head2);
511     ajReportSetHeaderS(report, head);
512 
513     ajDebug("sirna_begin begin=%d\n", begin);
514 
515     ajFeattableDel(&featab);
516     ajStrDel(&type);
517     ajStrDel(&head);
518     ajStrDel(&head2);
519 
520     return begin;
521 }
522 
523 
524 
525 
526 /* @funcstatic sirna_new_value ***********************************************
527 **
528 ** creates a PValue object and adds it to the list
529 **
530 ** @param [u] list [AjPList] list to add value to
531 ** @param [r] pos [ajint] position in sequence
532 ** @param [r] score [ajint] score
533 ** @param [r] GCcount [ajint] number of GC bases
534 ** @return [void]
535 ** @@
536 ******************************************************************************/
537 
sirna_new_value(AjPList list,ajint pos,ajint score,ajint GCcount)538 static void sirna_new_value(AjPList list, ajint pos, ajint score,
539 			    ajint GCcount)
540 {
541 
542     PValue value;
543 
544     AJNEW0(value);
545     value->pos     = pos;
546     value->GCcount = GCcount;
547     value->score   = score;
548 
549     ajListPushAppend(list, value);
550 
551     return;
552 }
553 
554 
555 
556 
557 /* @funcstatic sirna_compare_score *******************************************
558 **
559 ** Compares the scores of two positions for use in sorting
560 **
561 ** @param [r] v1 [const void*] First structure
562 ** @param [r] v2 [const void*] Second structure
563 ** @return [int] -1 if first value should sort before second, +1 if the
564 **         second value should sort first. 0 if they are identical
565 **         in score
566 ** @@
567 ******************************************************************************/
568 
sirna_compare_score(const void * v1,const void * v2)569 static int sirna_compare_score(const void* v1, const void* v2)
570 {
571     const PValue s1;
572     const PValue s2;
573 
574     s1 = *(PValue const *)v1;
575     s2 = *(PValue const *)v2;
576 
577 
578     if(s1->score > s2->score)
579     	return -1;
580     else if(s1->score < s2->score)
581     	return +1;
582 
583     /* sort by position if scores are equal */
584     if(s1->pos < s2->pos)
585 	return -1;
586     else if(s1->pos > s2->pos)
587 	return +1;
588 
589     return 0;
590 }
591 
592 
593 
594 
595 /* @funcstatic sirna_output *********************************************
596 **
597 ** Output the probes
598 **
599 ** @param [r] list [const AjPList] list of PValue structures
600 ** @param [u] TabRpt [AjPFeattable] feature table
601 ** @param [r] seq [const AjPSeq] Sequence
602 ** @param [r] context [AjBool] True if we wish to show the 2 bases before
603 **                             the probe region
604 ** @param [u] seqout [AjPSeqout] Ouput sequence object
605 ** @return [void]
606 ** @@
607 ******************************************************************************/
608 
sirna_output(const AjPList list,AjPFeattable TabRpt,const AjPSeq seq,AjBool context,AjPSeqout seqout)609 static void sirna_output(const AjPList list,
610 			 AjPFeattable TabRpt, const AjPSeq seq,
611 			 AjBool context, AjPSeqout seqout)
612 {
613     AjIList iter;
614     PValue value;
615     AjPFeature gf;
616     AjPStr tmpStr;
617     AjPStr subseq;
618     AjPStr source;
619     AjPStr type;
620     AjPStr name;
621     AjPStr desc;
622     AjPSeq seq23;
623 
624 
625     tmpStr = ajStrNew();
626     subseq = ajStrNew();
627     source = ajStrNewC("siRNA");
628     type   = ajStrNewC("misc_feature");
629     name   = ajStrNew();     /* new name of the 23 base target sequence */
630     desc = ajStrNew();       /* new description of 23 base target sequence */
631     seq23 = ajSeqNewRes(24);   /* 23-base target sequence */
632 
633     /* if no hits then ignore much of this routine */
634     if(ajListGetLength(list))
635     {
636 	iter = ajListIterNewread(list);
637 	while((value = ajListIterGet(iter)) != NULL)
638 	{
639 	    /* ajDebug("(%d) %d %d\n", value->pos+1, value->score,
640 	       value->GCcount); */
641 
642 	    gf = ajFeatNew(TabRpt, source, type, value->pos+1,
643 			   value->pos+23, (float)value->score, '+', 0);
644 
645 	    ajFmtPrintS(&tmpStr, "*gc %4.1f",
646 			((float)value->GCcount*100.0)/20.0);
647 	    ajFeatTagAddSS(gf,  NULL, tmpStr);
648 
649 	    /* get the sequences to order */
650 	    if(context)
651 	    {
652 		/*
653 		** get the first two characters of the sequence before
654 		** the siRNA probe region
655 		*/
656 		ajStrAssignC(&subseq, "(");
657 		ajStrAppendSubS(&subseq, ajSeqGetSeqS(seq),
658 				value->pos, value->pos+1);
659 		ajStrAppendC(&subseq, ")");
660 	    }
661 
662 	    ajStrAppendSubS(&subseq, ajSeqGetSeqS(seq),
663 			    value->pos+2, value->pos+20);
664 	    ajStrFmtUpper(&subseq);
665 	    ajStrExchangeKK(&subseq, 'T', 'U');
666 	    ajStrAppendC(&subseq, "dTdT");
667 	    ajFmtPrintS(&tmpStr, "*forward %S", subseq);
668 	    ajFeatTagAddSS(gf,  NULL, tmpStr);
669 
670 	    ajStrAssignSubS(&subseq, ajSeqGetSeqS(seq),
671 			    value->pos+2, value->pos+20);
672 	    ajStrFmtUpper(&subseq);
673 	    ajSeqstrReverse(&subseq);
674 	    ajStrExchangeKK(&subseq, 'T', 'U');
675 	    ajStrAppendC(&subseq, "dTdT");
676 	    ajFmtPrintS(&tmpStr, "*reverse %S", subseq);
677 	    ajFeatTagAddSS(gf,  NULL, tmpStr);
678 
679 	    /*
680 	    ** now write out the 23 base bit of sequence to the seqout file
681 	    ** get sequence
682 	    */
683 	    ajDebug("Now write sequence file\n");
684 	    ajStrAssignSubS(&subseq, ajSeqGetSeqS(seq),
685 			    value->pos, value->pos+22);
686 	    ajSeqAssignSeqS(seq23, subseq);
687 	    ajSeqSetNuc(seq23);
688 
689 	    /* give it a name */
690 	    ajDebug("Doing name\n");
691 	    ajStrAssignS(&name, ajSeqGetNameS(seq));
692 	    ajStrAppendC(&name, "_");
693 	    ajStrFromInt(&tmpStr, value->pos+1);
694 	    ajStrAppendS(&name, tmpStr);
695 	    ajSeqAssignNameS(seq23, name);
696 
697 	    /* get description */
698 	    ajDebug("Doing description\n");
699 	    ajFmtPrintS(&desc, "%%GC %4.1f Score %d ",
700 			((float)value->GCcount*100.0)/20.0,
701 			value->score);
702 	    ajStrAppendS(&desc, ajSeqGetDescS(seq));
703 	    ajSeqAssignDescS(seq23, desc);
704 
705 	    ajDebug("Write seq23\n");
706 	    ajSeqoutWriteSeq(seqout, seq23);
707 
708 	    /* prepare sequence string for re-use */
709 	    ajStrSetClear(&subseq);
710 	}
711 	ajListIterDel(&iter);
712     }
713 
714     ajStrDel(&tmpStr);
715     ajStrDel(&subseq);
716     ajStrDel(&source);
717     ajStrDel(&type);
718     ajStrDel(&name);
719     ajStrDel(&desc);
720     ajSeqDel(&seq23);
721 
722     return;
723 }
724 
725 
726 
727 
728 /*
729 For the definitive guide to siRNA, see:
730 http://www.mpibpc.gwdg.de/abteilungen/100/105/sirna.html
731 
732 
733 The siRNA user guide (revised October 11, 2002)
734 
735 Selection of siRNA duplexes from the target mRNA sequence
736 
737 Using Drosophila melanogaster lysates (Tuschl et al.  1999), we have
738 systematically analyzed the silencing efficiency of siRNA duplexes as a
739 function of the length of the siRNAs, the length of the overhang and the
740 sequence in the overhang (Elbashir et al.  2001c).  The most efficient
741 silencing was obtained with siRNA duplexes composed of 21-nt sense and
742 21-nt antisense strands, paired in a manner to have a 2-nt 3' overhang.
743 The sequence of the 2-nt 3' overhang makes a small contribution to the
744 specificity of target recognition restricted to the unpaired nucleotide
745 adjacent to the first base pair.  2'-Deoxynucleotides in the 3'
746 overhangs are as efficient as ribonucleotides, but are often cheaper to
747 synthesize and probably more nuclease resistant.  We used to select
748 siRNA sequences with TT in the overhang.
749 
750 The targeted region is selected from a given cDNA sequence beginning 50
751 to 100 nt downstream of the start codon.  Initially, 5' or 3' UTRs and
752 regions nearby the start codon were avoided assuming that UTR-binding
753 proteins and/or translation initiation complexes may interfere with
754 binding of the siRNP or RISC endonuclease complex.  More recently,
755 however, we have targeted 3'-UTRs and have not experienced any problems
756 in knocking down the targeted genes.  In order to design a siRNA duplex,
757 we search for the 23-nt sequence motif AA(N19)TT (N, any nucleotide) and
758 select hits with approx.  50% G/C-content (30% to 70% has also worked in
759 our hands).  If no suitable sequences are found, the search is extended
760 using the motif NA(N21).  The sequence of the sense siRNA corresponds to
761 (N19)TT or N21 (position 3 to 23 of the 23-nt motif), respectively.  In
762 the latter case, we convert the 3' end of the sense siRNA to TT.  The
763 rationale for this sequence conversion is to generate a symmetric duplex
764 with respect to the sequence composition of the sense and antisense 3'
765 overhangs.  The antisense siRNA is synthesized as the complement to
766 position 1 to 21 of the 23-nt motif.  Because position 1 of the 23-nt
767 motif is not recognized sequence-specifically by the antisense siRNA,
768 the 3'-most nucleotide residue of the antisense siRNA, can be chosen
769 deliberately.  However, the penultimate nucleotide of the antisense
770 siRNA (complementary to position 2 of the 23-nt motif) should always be
771 complementary to the targeted sequence.  For simplifying chemical
772 synthesis, we always use TT.  More recently, we preferentially select
773 siRNAs corresponding to the target motif NAR(N17)YNN, where R is purine
774 (A, G) and Y is pyrimidine (C, U).  The respective 21-nt sense and
775 antisense siRNAs therefore begin with a purine nucleotide and can also
776 be expressed from pol III expression vectors without a change in
777 targeting site; expression of RNAs from pol III promoters is only
778 efficient when the first transcribed nucleotide is a purine.
779 
780 We always design siRNAs with symmetric 3' TT overhangs, believing that
781 symmetric 3' overhangs help to ensure that the siRNPs are formed with
782 approximately equal ratios of sense and antisense target RNA-cleaving
783 siRNPs (Elbashir et al.  2001b; Elbashir et al.  2001c).  Please note
784 that the modification of the overhang of the sense sequence of the siRNA
785 duplex is not expected to affect targeted mRNA recognition, as the
786 antisense siRNA strand guides target recognition.  In summary, no matter
787 what you do to your overhangs, siRNAs should still function to a
788 reasonable extent.  However, using TT in the 3' overhang will always
789 help your RNA synthesis company to let you know when you accidentally
790 order a siRNA sequences 3' to 5' rather than in the recommended format
791 of 5' to 3'.  You may think this is funny, but it has happened quite a
792 lot.
793 
794 Compared to antisense or ribozyme technology, the secondary structure of
795 the target mRNA does not appear to have a strong effect on silencing.
796 We say that, because we have already knocked-down more than 20 genes
797 using a single, essentially randomly chosen siRNA duplex (Harborth et
798 al.  2001).  Only 3 siRNA duplexes have been ineffective so far.  In one
799 or two other cases, we have found siRNAs to be inactive because the
800 targeting site contained a single-nucleotide polymorphism.  We were also
801 able to knock-down two genes simultaneously (e.g.  lamin A/C and NuMA)
802 by using equal concentrations of siRNA duplexes.
803 
804 We recommend to blast-search (NCBI database) the selected siRNA sequence
805 against EST libraries to ensure that only one gene is targeted.  In
806 addition, we also recommend to knock-down your gene with two independent
807 siRNA duplexes to control for specificity of the silencing effect.  If
808 selected siRNA duplexes do not function for silencing, please check for
809 sequencing errors of the gene, polymorphisms, and whether your cell line
810 is really from the expected species.  Our initial studies on the
811 specificity of target recognition by siRNA duplexes indicate that a
812 single point mutation located in the paired region of an siRNA duplex is
813 sufficient to abolish target mRNA degradation (Elbashir et al.  2001c).
814 Furthermore, it is unknown if targeting of a gene by two different siRNA
815 duplexes is more effective than using a single siRNA duplex.  We think
816 that the amount of siRNA-associating proteins is limiting for silencing
817 rather than the target accessibility.
818 
819 Sequences of siRNA duplexes used in our studies
820 
821 The sequences of siRNA duplexes described in the Nature paper (Elbashir
822 et al.  2001a):
823 
824 Lamin A/C
825 targeted region (cDNA): 5' AACTGGACTTCCAGAAGAACATC
826            sense siRNA: 5'   CUGGACUUCCAGAAGAACAdTdT
827        antisense siRNA: 5' UGUUCUUCUGGAAGUCCAGdTdT
828 
829 
830 Vimentin
831 targeted region (cDNA): 5' AACTACATCGACAAGGTGCGCTT
832            sense siRNA: 5'   CUACAUCGACAAGGUGCGCdTdT
833        antisense siRNA: 5' GCGCACCUUGUCGAUGUAGdTdT
834 
835 
836 NuMA
837 targeted region (cDNA): 5' AAGGCGTGGCAGGAGAAGTTCTT
838            sense siRNA: 5'   GGCGUGGCAGGAGAAGUUCdTdT
839        antisense siRNA: 5' GAACUUCUCCUGCCACGCCdTdT
840 
841 
842 Lamin B1
843 targeted region (cDNA): 5' AACGCGCTTGGTAGAGGTGGATT
844            sense siRNA: 5'   CGCGCUUGGUAGAGGUGGAdTdT
845        antisense siRNA: 5' UCCACCUCUACCAAGCGCGdTdT
846 
847 
848 GL2 Luciferase
849 targeted region (cDNA): 5' AACGTACGCGGAATACTTCGATT
850            sense siRNA: 5'   CGUACGCGGAAUACUUCGAdTdT
851        antisense siRNA: 5' UCGAAGUAUUCCGCGUACGdTdT
852 
853 ============================================================================
854 
855 siRNA description from http://www.oligoengine.com/
856 30 October 2002
857 
858 siRNA: Background and Applications RNA interference, or RNAi, is a
859 phenomenon in which double stranded RNA (dsRNA) effects silencing of the
860 expression of genes that are highly homologous to either of the RNA
861 strands in the duplex.  Gene silencing in RNAi results from the
862 degradation of mRNA sequences, and the effect has been used to determine
863 the function of many genes in Drosophilia, C.  elegans, and many plant
864 species.
865 
866 The duration of knockdown by siRNA can typically last for 7-10 days, and
867 has been shown to transfer to daughter cells.  Of further note, siRNAs
868 are effective at quantities much lower than alternative gene silencing
869 methodologies, including antisense and ribozyme based strategies.
870 
871 Considerable interest has developed in RNAi lately among life science
872 professionals.  The ability to quickly and easily generate
873 loss-of-function phenotypes in mammalian cells has spawned a new wave of
874 research endeavors; companies are also exploring the potential of siRNA
875 for use in drug development and gene-specific therapeutics.
876 
877 Due to various mechanisms of antiviral response to long dsRNA, RNAi at
878 first proved more difficult to establish in mammalian species.  Then,
879 Tuschl, Elbashir, and others discovered that RNAi can be elicited very
880 effectively by well-defined 21-base duplex RNAs.  When these small
881 interfering RNA, or siRNA, are added in duplex form with a transfection
882 agent to mammalian cell cultures, the 21-base-pair RNA acts in concert
883 with cellular components to silence the gene with sequence homology to
884 one of the siRNA sequences.
885 
886 Strategies for the design of effective siRNA sequences have been
887 recently documented, most notably by Sayda Elbashir, Thomas Tuschl, et
888 al.  Like all of the design applications in the OligoEngine Platform,
889 our siRNA Design Tool transforms these strategies into an automated
890 process that improves accuracy, saves time, and provides valuable
891 feedback to the user.
892 
893 In brief, the Tool analyzes a user's gene sequence and applies various
894 algorithms to generate "candidate" siRNA sequences that are 19
895 nucleotides in length.  Users then select the 2-nt 3' overhang to order
896 two (sense and antisense) 21-nucleotide RNA oligos.
897 
898 Use either:
899 
900 1) Type the accession number of a GenBank mRNA file associated with your
901 gene of interest.  (Hint: for best results, use a GenBank file with
902 complete CDS)
903 
904 2) Copy a complete mRNA sequence from another file and use the "Paste
905 Sequence" button to quickly input your sequence.
906 
907 Next, select the general parameters for siRNA design.  These include:
908 
909 GC Content: 50% is optimal, and the Tool uses a default range of 45-55%.
910 You can set parameters from 30-70%, the max range for potentially
911 effective siRNA.
912 
913 Leader: The tool uses AA as its default, the widely accepted standard
914 for the dinucleotide leader.  CA leaders can also be effective and may
915 be provide a broader range of options, especially when searching for a
916 sequence homologous to two related genes.
917 
918 UTR Search: While we do not recommend siRNA candidates targeting 5' or
919 3' UTR regions, the Tool gives users the option to design in UTRs (if
920 UTR sequences are available).
921 
922 ORF length: For paste-in sequences, the Tool first predicts the ORF to
923 ensure siRNA design within the coding region.  Setting a minimum ORF
924 length helps to make an accurate prediction and account for "false" stop
925 codons inside the CDS.
926 
927 
928 
929 Choose the 3' overhang: Click to select dTdT (default setting and
930 generally recommended choice) or UU.  Contact us for other custom
931 overhang options if required.
932 
933 BLAST your siRNA: click the BLAST button to check for sequence homology
934 against our local NCBI genome databases.  When complete, the button will
935 turn green- click it to view the BLAST results in a separate screen
936 (reports on any siRNA you purchase are saved for your future review).
937 
938 
939 As mentioned, the siRNA Design Tool is based on guidelines developed by
940 leading researchers in the study of siRNA and RNAi: chiefly, Sayda
941 Elbashir, Thomas Tuschl, and their coworkers.  These in turn were
942 developed based on systematic analyses of the structural and functional
943 requirements of siRNA duplexes in Drosophila cell free lysates.
944 
945 They have systematically analyzed the silencing efficiency of siRNA
946 duplexes as a function of the length of the siRNAs, the length of the
947 3'-overhangs, and the sequence in the overhangs.  More specifically, the
948 most effective siRNA were those where:
949 
950 The duplexes are composed of 21-nucleotide sense and 21-nt antisense
951 strands, paired in a manner to have a 19-nt duplex region and a 2-nt
952 overhang at each 3'-terminus.
953 
954 The sense strand of the 19-nt duplex is homologous to the target gene.
955 
956 The 3'-overhangs contain either UU or dTdT dinucleotides.
957 
958 The overhangs ensure that the sequence-specific endonuclease complexes
959 (siRNPs) are formed with approximately equal ratios of sense and
960 antisense target RNA cleaving siRNPs.
961 
962 The 3'-overhang in the sense strand provides no contribution to
963 recognition as it is believed the antisense siRNA strand guides target
964 recognition.  Therefore, 3'-overhang of the antisense sequences is
965 complementary to the target mRNA but the symmetrical 3'-overhang of the
966 sense siRNA oligo does not need to correspond to the mRNA.  The use of
967 deoxythymidines in both 3'-overhangs may increase nuclease resistance,
968 although siRNA duplexes with either UU or dTdT overhangs have both
969 proven effective.
970 
971 It is recommended that the targeted region of the mRNA is within the
972 ORF.  Both the 5' and 3' UTRs and regions near the start codon are not
973 recommended for targeting as these may be richer in regulatory protein
974 binding sites; UTR-binding proteins and/or translation initiation
975 complexes may interfere with binding of the siRNP endonuclease complex.
976 (However, because these guidelines continue to evolve and be refined,
977 the Tool gives users the option to search for targets within the UTRs if
978 so desired.)
979 
980 Taking these above characteristics and phenomena into account, the
981 general strategy for siRNA sequence design is summarized in the box at
982 right.
983 
984 To date, successful silencing has been achieved using this strategy to
985 select the target sequence.  Based on feedback from customers and
986 collaborators where transfection conditions have been optimized, siRNA
987 duplexes designed using this method will work 70-80% of the time.  This
988 percentage is consistent with reports from Tuschl, Elbashir, and others
989 
990 
991 ============================================================================
992 
993 http://www.xeragon.com/siRNA_support.html
994 Nov 2002
995 
996 siRNA Target Sequence Design
997 
998 Introduction
999 
1000 Initial studies of mammalian RNAi suggest that the most efficient
1001 gene-silencing effect is achieved using double-stranded siRNA having a
1002 19-nucleotide complementary region and a 2-nucleotide 3' overhang at
1003 each end (see examples below).  Current models of the RNAi mechanism
1004 suggest that the antisense siRNA strand recognizes the specific gene
1005 target.
1006 
1007 In gene-specific RNAi, the open reading frame segment of the mRNA is
1008 usually targeted.  The search for an appropriate target sequence should
1009 begin 50-100 nucleotides downstream of the start codon.  To avoid
1010 interference from mRNA regulatory proteins, sequences in the 5' or 3'
1011 untranslated region or near the start codon should not be targeted.
1012 
1013 Shortly after the first reports on the use of RNAi in mammalian cells, a
1014 set of simple rules for siRNA target site selection were developed
1015 (http://www.mpibpc.gwdg.de/abteilungen/100/105/sirna.html).  One key
1016 feature in this early set of rules was the suggestion to look for
1017 specific sequence motifs to find the most appropriate siRNA targets.
1018 This original set of rules suggested that siRNA target sites with the
1019 sequence AA(N19)TT (Example 3) are the most appropriate target sequence.
1020 If an appropriate sequence could not be found, an AA(N21) (Example 1) or
1021 CA(N21) sequence motif could be considered.  In these situations,
1022 because the sense strand does not appear to be involved in target
1023 recognition, the sequence of the sense strand can be synthesized with a
1024 dTdT regardless of the target sequence.
1025 
1026 A number of laboratories have used these rules to create siRNA
1027 oligonucleotides that successfully silenced target gene expression.
1028 However, based on feedback from multiple laboratories, it does not
1029 appear that there is a strict need for the siRNA target sequence to
1030 begin with AA.
1031 
1032 Based on the results of an extensive study of a single gene, in which
1033 approximately 70 different siRNA oligonucleotide target sequences were
1034 used for RNAi, we suggest a modified set of rules (see below).  The
1035 results of this recent work, indicate that choosing a 21 nt region of
1036 the mRNA with a GC content as close as possible to 50% is a more
1037 important consideration than choosing a target sequence that begins with
1038 AA.  Another key consideration in target selection is to avoid having
1039 more than three guanosines in a row, since poly G sequences can
1040 hyperstack and form agglomerates that potentially interfere with the
1041 siRNA silencing mechanism.  It is recommended that these two rules be a
1042 primary consideration when designing and siRNA.
1043 
1044 There are some advantages to using a target sequence that begins with
1045 AA.  The use of a target sequence that begins with an AA allows dT
1046 residues to be used to create the 3'-end of the siRNA, this simplifies
1047 the synthesis and decreases the cost of the siRNA.  Although duplexes
1048 synthesized with UU overhangs appear to work as well as dTdT overhangs,
1049 oligonucleotides with a dTdT overhang cost less and are potentially more
1050 resistant to nucleases.  However, if the GC content is higher than 60%
1051 alternate regions of the coding region of the mRNA should be considered.
1052 Our suggestions for criteria to consider when designing an siRNA
1053 oligonucleotide are summarized in the "new relaxed set of rules" below.
1054 In addition, examples of siRNA oligonucleotides that start with an AA
1055 sequence as well as examples of siRNA oligonucleotides that do not are
1056 given.
1057 
1058 Current Considerations for siRNA Design (19 complementary bases and 2
1059 bases overhang)
1060 
1061 1.Choose a 21 (Example 1) or a 23 (Example 2 and 3) nt sequence in the
1062 coding region of the mRNA with a GC ratio as close to 50% as possible.
1063 Ideally the GC ratio will be between 45% and 55%.  An siRNA with 60% GC
1064 content has worked in many cases, however when an siRNA with 70% GC
1065 content is used for RNAi typically a sharp decrease in the level of
1066 silencing is observed.  Avoid regions within 50-100 nt of the AUG start
1067 codon or 50-100 nt of the termination codon.
1068 
1069 2.Avoid more than three guanosines in a row.  Poly G sequences can
1070 hyperstack and therefore form agglomerates that potentially interfere in
1071 the siRNA silencing mechanism.
1072 
1073 3.Preferentially choose target sequences that start with two adenosines
1074 (Example 1).  This will make synthesis easier, more economical, and
1075 create siRNA that is potentially more resistant to nucleases.  When a
1076 sequence that starts with AA is used, siRNA with dTdT overhangs can be
1077 produced.  If it is not possible to find a sequence that starts with AA
1078 and matches rules 1and 2, choose any 23 nt region of the coding sequence
1079 with a GC content between 45 and 55% that does not have more than three
1080 guanosines in a row (Example 3).
1081 
1082 4.Ensure that your target sequence is not homologous to any other genes.
1083 It is strongly recommended that a BLAST search of the target sequence be
1084 performed to prevent the silencing of unwanted genes with a similar
1085 sequence.
1086 
1087 5.Based on feedback from various customers, labelling the 3'-end of the
1088 sense strand gives the best results with respect to not interfering with
1089 the gene silencing mechanism of siRNA.  Consult our custom siRNA page
1090 for available modifications.
1091 
1092 When these rules are used for siRNA target sequence design RNAi
1093 effectively silences genes in more than 80% of cases.  Current data
1094 indicate that there are regions of some mRNAs where gene silencing does
1095 not work.  To help ensure that a given target gene is silenced, it is
1096 advised that at least two target sequences as far apart on the gene as
1097 possible be chosen.
1098 
1099 This method does not take into account mRNA secondary structure.  At
1100 present it does not appear that mRNA secondary structure has a
1101 significant impact on gene silencing.
1102 
1103 
1104 */
1105