1 /* @source sirna application
2 **
3 ** Finds siRNA duplexes in mRNA
4 ** @author Copyright (C) Gary Williams (gwilliam@hgmp.mrc.ac.uk)
5 ** @@
6 **
7 ** This program is free software; you can redistribute it and/or
8 ** modify it under the terms of the GNU General Public License
9 ** as published by the Free Software Foundation; either version 2
10 ** of the License, or (at your option) any later version.
11 **
12 ** This program is distributed in the hope that it will be useful,
13 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ** GNU General Public License for more details.
16 **
17 ** You should have received a copy of the GNU General Public License
18 ** along with this program; if not, write to the Free Software
19 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20 ******************************************************************************/
21
22
23 #include "emboss.h"
24 #include <math.h>
25 #include <stdlib.h>
26
27 #define NOT_WANTED -100
28
29
30
31
32 static void sirna_report(AjPReport report, const AjPSeq seq,
33 AjBool poliii, AjBool aa, AjBool tt, AjBool polybase,
34 AjBool context, AjPSeqout seqout);
35 static ajint sirna_begin(const AjPSeq seq, AjPReport report, AjBool poliii,
36 AjBool aa, AjBool tt, AjBool polybase,
37 AjBool context);
38 static void sirna_new_value(AjPList list, ajint pos, ajint score,
39 ajint GCcount);
40 static int sirna_compare_score(const void* v1, const void* v2);
41 static void sirna_output(const AjPList list,
42 AjPFeattable TabRpt, const AjPSeq seq,
43 AjBool context, AjPSeqout seqout);
44
45
46
47
48 /* @datastatic PValue *********************************************************
49 **
50 ** structure for position, score and GC count at the start of a window
51 **
52 ** @alias SValue
53 ** @alias OValue
54 **
55 ** @attr pos [ajint] Position
56 ** @attr GCcount [ajint] GC count
57 ** @attr score [ajint] Score
58 ******************************************************************************/
59
60 typedef struct SValue
61 {
62 ajint pos;
63 ajint GCcount;
64 ajint score;
65 } OValue;
66 #define PValue OValue*
67
68
69
70
71 /* @prog sirna ****************************************************************
72 **
73 ** Finds siRNA duplexes in mRNA
74 **
75 ******************************************************************************/
76
main(int argc,char ** argv)77 int main(int argc, char **argv)
78 {
79 AjPSeqall seqall;
80 AjPReport report = NULL;
81 AjBool poliii;
82 AjBool aa;
83 AjBool tt;
84 AjBool polybase;
85 AjBool context;
86 AjPSeqout seqout;
87
88 AjPSeq seq = NULL;
89
90
91 embInit("sirna",argc,argv);
92
93 seqall = ajAcdGetSeqall("sequence");
94 poliii = ajAcdGetBoolean("poliii");
95 aa = ajAcdGetBoolean("aa");
96 tt = ajAcdGetBoolean("tt");
97 polybase = ajAcdGetBoolean("polybase");
98 context = ajAcdGetBoolean("context");
99 report = ajAcdGetReport("outfile");
100 seqout = ajAcdGetSeqoutall("outseq");
101
102 while(ajSeqallNext(seqall, &seq))
103 sirna_report(report, seq, poliii, aa, tt, polybase, context, seqout);
104
105 ajReportSetSeqstats(report, seqall);
106
107 ajSeqallDel(&seqall);
108 ajSeqDel(&seq);
109 ajReportClose(report);
110 ajSeqoutDel(&seqout);
111 ajReportDel(&report);
112
113 embExit();
114
115 return 0;
116 }
117
118
119
120
121 /* @funcstatic sirna_report ***************************************************
122 **
123 ** finds regions that can be used as siRNAs
124 **
125 ** @param [u] report [AjPReport] Undocumented
126 ** @param [r] seq [const AjPSeq] Sequence
127 ** @param [r] poliii [AjBool] True if want poliii expressable probes
128 ** @param [r] aa [AjBool] True if want AA at start of region
129 ** @param [r] tt [AjBool] True if want TT at end of region
130 ** @param [r] polybase [AjBool] True if we allow 4-mers of any base
131 ** @param [r] context [AjBool] True if we wish to show the 2 bases before
132 ** the probe region
133 ** @param [u] seqout [AjPSeqout] Ouput sequence object
134 ** @return [void]
135 ** @@
136 ******************************************************************************/
137
138
sirna_report(AjPReport report,const AjPSeq seq,AjBool poliii,AjBool aa,AjBool tt,AjBool polybase,AjBool context,AjPSeqout seqout)139 static void sirna_report(AjPReport report, const AjPSeq seq, AjBool poliii,
140 AjBool aa, AjBool tt, AjBool polybase,
141 AjBool context, AjPSeqout seqout)
142 {
143 AjPFeattable TabRpt=NULL; /* output feature table for report */
144 ajint position;
145 ajint i;
146 ajint j;
147 ajint window_len = 23; /* the length of the required siRNA region */
148 ajint shift = 1; /* want to evaluate at each position */
149 ajint count_gc; /* number of G + C bases in the window */
150 char c;
151 const char *cseq;
152 ajint aaaa_count;
153 ajint cccc_count;
154 ajint gggg_count;
155 ajint tttt_count;
156 AjPList scorelist;
157 ajint score;
158 AjPStr newstr = NULL;
159
160 /* scores */
161 ajint CDS_50 = NOT_WANTED;
162 ajint CDS_100 = -2;
163 ajint A_start = +3;
164 ajint TT_end = +1;
165 ajint GGGG = NOT_WANTED;
166
167 ajint GC_5 = NOT_WANTED;
168 ajint GC_6 = 0;
169 ajint GC_7 = +2;
170 ajint GC_8 = +4;
171 ajint GC_9 = +5;
172 ajint GC_10 = +6;
173 ajint GC_11 = +5;
174 ajint GC_12 = +4;
175 ajint GC_13 = +2;
176 ajint GC_14 = +0;
177 ajint GC_15 = NOT_WANTED;
178
179 /* heavy penalty for position 2 not 'A' */
180 ajint POS_2_NOT_A = NOT_WANTED;
181
182 /*
183 ** if want Pol III expressable probes, then have a heavy
184 ** penalty for not NAR(N17)YNN
185 */
186 ajint NOT_POLIII = NOT_WANTED;
187
188 ajint e;
189 ajint CDS_begin;
190
191 /* initalise report feature table */
192 TabRpt = ajFeattableNewSeq(seq);
193
194 /* get start of CDS region */
195 CDS_begin = sirna_begin(seq, report, poliii, aa, tt, polybase, context);
196
197 cseq = ajSeqGetSeqC(seq);
198
199 /*
200 ** Using a list holding info on position, %GC, score. Sort by
201 ** score and write the top N over the threshold to the report file
202 */
203 scorelist = ajListNew();
204
205 /*
206 ** initialise the GC count for the first window
207 ** we count the %GC in the 20 bases from position 2 to 21 of the 23
208 */
209 count_gc = 0;
210 for (i=CDS_begin+1; i < CDS_begin+window_len-2; i++)
211 {
212 c = cseq[i];
213 if(tolower((int)c) == 'g' || tolower((int)c) == 'c')
214 count_gc++;
215 }
216
217 e = (ajSeqGetEnd(seq) - window_len); /* last position a window can start */
218 for(position=CDS_begin; position < e; position+=shift)
219 {
220 /* initialise score */
221 score = 0;
222
223 /* penalty for being too close to the CDS start */
224 if(CDS_begin != 0)
225 {
226 /* check for a CDS start site */
227 if(position < CDS_begin + 50)
228 score += CDS_50;
229 else if(position < CDS_begin + 100)
230 score += CDS_100;
231 }
232
233 /* set score for GC bases */
234 if(count_gc <= 5)
235 score += GC_5;
236 else if(count_gc == 6)
237 score += GC_6;
238 else if(count_gc == 7)
239 score += GC_7;
240 else if(count_gc == 8)
241 score += GC_8;
242 else if(count_gc == 9)
243 score += GC_9;
244 else if(count_gc == 10)
245 score += GC_10;
246 else if(count_gc == 11)
247 score += GC_11;
248 else if(count_gc == 12)
249 score += GC_12;
250 else if(count_gc == 13)
251 score += GC_13;
252 else if(count_gc == 14)
253 score += GC_14;
254 else if(count_gc >= 15)
255 score += GC_15;
256
257
258 ajDebug("%d) GC score = %d\n", position, score);
259
260 /* check for AA at start */
261 if(tolower((int)cseq[position]) == 'a')
262 {
263 score += A_start;
264 ajDebug("Found A at start\n");
265 }
266 else if(aa)
267 score += NOT_WANTED;
268
269 /* check for TT at end */
270 if(((tolower((int)cseq[position+window_len-1]) == 't') &&
271 (tolower((int)cseq[position+window_len-2]) == 't')) ||
272 ((tolower((int)cseq[position+window_len-1]) == 'u') &&
273 (tolower((int)cseq[position+window_len-2]) == 'u')))
274 {
275 score += TT_end;
276 ajDebug("Found TT at end\n");
277 }
278 else if(tt)
279 score += NOT_WANTED;
280
281
282 /* check for GGGG and other 4-mers */
283 aaaa_count = 0;
284 cccc_count = 0;
285 gggg_count = 0;
286 tttt_count = 0;
287 for(j=position; j < position+window_len; j++)
288 {
289 c = cseq[j];
290 if(tolower((int)c) != 'a')
291 aaaa_count = 0;
292 else
293 aaaa_count++;
294
295 if(tolower((int)c) != 'c')
296 cccc_count = 0;
297 else
298 cccc_count++;
299
300 if(tolower((int)c) != 'g')
301 gggg_count = 0;
302 else
303 gggg_count++;
304
305 if(tolower((int)c) != 't' &&
306 tolower((int)c) != 'u')
307 tttt_count = 0;
308 else
309 tttt_count++;
310
311 /* always bomb out if we find GGGG */
312 if(gggg_count >= 4)
313 {
314 score += GGGG;
315 ajDebug("Found a GGGG\n");
316 break;
317 }
318
319 /* bomb out if we don't want any 4-mer and we find one */
320 if(!polybase)
321 {
322 if(aaaa_count >= 4)
323 {
324 score += NOT_WANTED;
325 break;
326 }
327
328 if(cccc_count >= 4)
329 {
330 score += NOT_WANTED;
331 break;
332 }
333
334 if(tttt_count >= 4) {
335 score += NOT_WANTED;
336 break;
337 }
338 }
339 }
340
341 /* check to see if the second base is an 'A' */
342 if(tolower((int)cseq[position+1]) != 'a')
343 score += POS_2_NOT_A;
344
345 /*
346 ** if want probes that can be expressed from Pol III
347 ** expression vectors
348 ** add a penalty if they do not have the correct purine/pyrimidine
349 ** pattern
350 */
351 if(poliii &&
352 ((tolower((int)cseq[position+2]) != 'a' &&
353 tolower((int)cseq[position+2]) != 'g') ||
354 (tolower((int)cseq[position+window_len-3]) != 'c' &&
355 tolower((int)cseq[position+window_len-3]) != 'u' &&
356 tolower((int)cseq[position+window_len-3]) != 't')))
357 {
358 score += NOT_POLIII;
359 ajDebug("Not a PolIII region\n");
360 }
361
362 /* save the score and other details */
363 if(score > 0)
364 sirna_new_value(scorelist, position, score, count_gc);
365
366 /*
367 ** update count of GC bases
368 ** dropping this base off the 5' end of the window
369 */
370 c = cseq[position+1];
371 ajDebug("%d) score = %d\n", position, score);
372 ajDebug("** base = %c\n", c);
373
374 if(tolower((int)c) == 'g' || tolower((int)c) == 'c')
375 count_gc--;
376
377 /* new base shifted onto 3' end of window -2 */
378 c = cseq[position+window_len-2];
379 if(tolower((int)c) == 'g' || tolower((int)c) == 'c')
380 count_gc++;
381
382 ajDebug("Next GC count=%d\n", count_gc);
383 }
384
385 /* now sort the list by score then position */
386 ajListSort(scorelist, sirna_compare_score);
387
388
389
390 /* now pop off the positive scores and write them to the report object */
391 sirna_output(scorelist, TabRpt, seq, context, seqout);
392 ajReportWrite(report, TabRpt, seq);
393
394 ajSeqoutClose(seqout);
395 ajListFreeData(&scorelist);
396 ajFeattableDel(&TabRpt);
397 ajStrDel(&newstr);
398
399 return;
400 }
401
402
403
404
405 /* @funcstatic sirna_begin ***************************************************
406 **
407 ** Finds the start position of the CDS or uses -sbegin if no feature table
408 ** Adds some explanation to the report header.
409 **
410 ** @param [r] seq [const AjPSeq] Sequence
411 ** @param [u] report [AjPReport] Report object
412 ** @param [r] poliii [AjBool] True if want poliii expressable probes
413 ** @param [r] aa [AjBool] True if want AA at start of region
414 ** @param [r] tt [AjBool] True if want TT at end of region
415 ** @param [r] polybase [AjBool] True if we allow 4-mers of any base
416 ** @param [r] context [AjBool] True if we wish to show the 2 bases
417 ** before the probe region
418 ** @return [ajint] start of CDS region (using positions starting from 0)
419 ** @@
420 ******************************************************************************/
421
sirna_begin(const AjPSeq seq,AjPReport report,AjBool poliii,AjBool aa,AjBool tt,AjBool polybase,AjBool context)422 static ajint sirna_begin(const AjPSeq seq, AjPReport report, AjBool poliii,
423 AjBool aa, AjBool tt, AjBool polybase,
424 AjBool context)
425 {
426 AjPFeattable featab = NULL; /* input sequence feature table */
427 ajint begin = 0;
428 AjPFeature gf = NULL;
429 AjIList iter = NULL;
430 AjPStr type = NULL;
431 AjPStr head = NULL;
432 AjPStr head2 = NULL;
433
434 /* get input sequence features */
435 featab = ajSeqGetFeatCopy(seq);
436 type = ajStrNew();
437 ajStrAssignC(&type, "CDS");
438 head = ajStrNew();
439 head2 = ajStrNew();
440
441 ajDebug("sirna_begin()\n");
442
443 /* say something about the options in the report header */
444 if(poliii)
445 ajStrAppendC(&head, "Selecting only regions suitable for "
446 "PolIII expression vectors\n");
447
448 if(aa)
449 ajStrAppendC(&head, "Selecting only siRNA regions starting with 'AA'\n");
450
451 if(tt)
452 ajStrAppendC(&head, "Selecting only siRNA regions ending with 'TT'\n");
453
454 if(!polybase)
455 ajStrAppendC(&head, "Selecting only siRNA regions with no 4-mers "
456 "of any base\n");
457
458 if(context)
459 ajStrAppendC(&head, "The forward sense sequence shows the first 2 "
460 "bases of\nthe 23 base region in brackets, this "
461 "should be ignored\nwhen ordering siRNA probes.\n");
462
463 /* are there any features - find the first CDS feature */
464 if(ajFeattableGetSize(featab))
465 {
466 iter = ajListIterNewread(featab->Features);
467 while(!ajListIterDone(iter))
468 {
469 gf = ajListIterGet(iter);
470 if(ajStrMatchS(type, ajFeatGetType(gf)))
471 {
472 /* is the feature 'CDS'? */
473 begin = ajFeatGetStart(gf)-1;
474 ajFmtPrintS(&head2, "CDS region found in feature table "
475 "starting at %d",
476 begin+1);
477 /* ajDebug("Found a CDS at %d\n", begin);*/
478 break;
479 }
480 }
481 ajListIterDel(&iter);
482 }
483
484 /* if didn't find a CDS, assume -sbegin is the CDS */
485 if(begin == 0)
486 {
487 begin = ajSeqGetBegin(seq)-1;
488 if(begin == 0)
489 {
490 ajDebug("begin = 0\n");
491 ajFmtPrintS(&head2, "%s%s%s%s",
492 "No CDS region was found in the feature table.\n",
493 "No CDS region was indicated by setting -sbegin.\n",
494 "There will therefore be no penalty for siRNAs found ",
495 "in the first 100 bases.");
496 }
497 else
498 {
499 ajDebug("begin != 0\n");
500 ajFmtPrintS(&head2, "%s%s%s%d",
501 "No CDS region was found in the feature table.\n",
502 "The CDS region is assumed to start at the position ",
503 "given by -sbegin: ",
504 begin+1);
505 }
506
507 ajDebug("CDS specified at %d\n", begin);
508 }
509
510 ajStrAppendS(&head, head2);
511 ajReportSetHeaderS(report, head);
512
513 ajDebug("sirna_begin begin=%d\n", begin);
514
515 ajFeattableDel(&featab);
516 ajStrDel(&type);
517 ajStrDel(&head);
518 ajStrDel(&head2);
519
520 return begin;
521 }
522
523
524
525
526 /* @funcstatic sirna_new_value ***********************************************
527 **
528 ** creates a PValue object and adds it to the list
529 **
530 ** @param [u] list [AjPList] list to add value to
531 ** @param [r] pos [ajint] position in sequence
532 ** @param [r] score [ajint] score
533 ** @param [r] GCcount [ajint] number of GC bases
534 ** @return [void]
535 ** @@
536 ******************************************************************************/
537
sirna_new_value(AjPList list,ajint pos,ajint score,ajint GCcount)538 static void sirna_new_value(AjPList list, ajint pos, ajint score,
539 ajint GCcount)
540 {
541
542 PValue value;
543
544 AJNEW0(value);
545 value->pos = pos;
546 value->GCcount = GCcount;
547 value->score = score;
548
549 ajListPushAppend(list, value);
550
551 return;
552 }
553
554
555
556
557 /* @funcstatic sirna_compare_score *******************************************
558 **
559 ** Compares the scores of two positions for use in sorting
560 **
561 ** @param [r] v1 [const void*] First structure
562 ** @param [r] v2 [const void*] Second structure
563 ** @return [int] -1 if first value should sort before second, +1 if the
564 ** second value should sort first. 0 if they are identical
565 ** in score
566 ** @@
567 ******************************************************************************/
568
sirna_compare_score(const void * v1,const void * v2)569 static int sirna_compare_score(const void* v1, const void* v2)
570 {
571 const PValue s1;
572 const PValue s2;
573
574 s1 = *(PValue const *)v1;
575 s2 = *(PValue const *)v2;
576
577
578 if(s1->score > s2->score)
579 return -1;
580 else if(s1->score < s2->score)
581 return +1;
582
583 /* sort by position if scores are equal */
584 if(s1->pos < s2->pos)
585 return -1;
586 else if(s1->pos > s2->pos)
587 return +1;
588
589 return 0;
590 }
591
592
593
594
595 /* @funcstatic sirna_output *********************************************
596 **
597 ** Output the probes
598 **
599 ** @param [r] list [const AjPList] list of PValue structures
600 ** @param [u] TabRpt [AjPFeattable] feature table
601 ** @param [r] seq [const AjPSeq] Sequence
602 ** @param [r] context [AjBool] True if we wish to show the 2 bases before
603 ** the probe region
604 ** @param [u] seqout [AjPSeqout] Ouput sequence object
605 ** @return [void]
606 ** @@
607 ******************************************************************************/
608
sirna_output(const AjPList list,AjPFeattable TabRpt,const AjPSeq seq,AjBool context,AjPSeqout seqout)609 static void sirna_output(const AjPList list,
610 AjPFeattable TabRpt, const AjPSeq seq,
611 AjBool context, AjPSeqout seqout)
612 {
613 AjIList iter;
614 PValue value;
615 AjPFeature gf;
616 AjPStr tmpStr;
617 AjPStr subseq;
618 AjPStr source;
619 AjPStr type;
620 AjPStr name;
621 AjPStr desc;
622 AjPSeq seq23;
623
624
625 tmpStr = ajStrNew();
626 subseq = ajStrNew();
627 source = ajStrNewC("siRNA");
628 type = ajStrNewC("misc_feature");
629 name = ajStrNew(); /* new name of the 23 base target sequence */
630 desc = ajStrNew(); /* new description of 23 base target sequence */
631 seq23 = ajSeqNewRes(24); /* 23-base target sequence */
632
633 /* if no hits then ignore much of this routine */
634 if(ajListGetLength(list))
635 {
636 iter = ajListIterNewread(list);
637 while((value = ajListIterGet(iter)) != NULL)
638 {
639 /* ajDebug("(%d) %d %d\n", value->pos+1, value->score,
640 value->GCcount); */
641
642 gf = ajFeatNew(TabRpt, source, type, value->pos+1,
643 value->pos+23, (float)value->score, '+', 0);
644
645 ajFmtPrintS(&tmpStr, "*gc %4.1f",
646 ((float)value->GCcount*100.0)/20.0);
647 ajFeatTagAddSS(gf, NULL, tmpStr);
648
649 /* get the sequences to order */
650 if(context)
651 {
652 /*
653 ** get the first two characters of the sequence before
654 ** the siRNA probe region
655 */
656 ajStrAssignC(&subseq, "(");
657 ajStrAppendSubS(&subseq, ajSeqGetSeqS(seq),
658 value->pos, value->pos+1);
659 ajStrAppendC(&subseq, ")");
660 }
661
662 ajStrAppendSubS(&subseq, ajSeqGetSeqS(seq),
663 value->pos+2, value->pos+20);
664 ajStrFmtUpper(&subseq);
665 ajStrExchangeKK(&subseq, 'T', 'U');
666 ajStrAppendC(&subseq, "dTdT");
667 ajFmtPrintS(&tmpStr, "*forward %S", subseq);
668 ajFeatTagAddSS(gf, NULL, tmpStr);
669
670 ajStrAssignSubS(&subseq, ajSeqGetSeqS(seq),
671 value->pos+2, value->pos+20);
672 ajStrFmtUpper(&subseq);
673 ajSeqstrReverse(&subseq);
674 ajStrExchangeKK(&subseq, 'T', 'U');
675 ajStrAppendC(&subseq, "dTdT");
676 ajFmtPrintS(&tmpStr, "*reverse %S", subseq);
677 ajFeatTagAddSS(gf, NULL, tmpStr);
678
679 /*
680 ** now write out the 23 base bit of sequence to the seqout file
681 ** get sequence
682 */
683 ajDebug("Now write sequence file\n");
684 ajStrAssignSubS(&subseq, ajSeqGetSeqS(seq),
685 value->pos, value->pos+22);
686 ajSeqAssignSeqS(seq23, subseq);
687 ajSeqSetNuc(seq23);
688
689 /* give it a name */
690 ajDebug("Doing name\n");
691 ajStrAssignS(&name, ajSeqGetNameS(seq));
692 ajStrAppendC(&name, "_");
693 ajStrFromInt(&tmpStr, value->pos+1);
694 ajStrAppendS(&name, tmpStr);
695 ajSeqAssignNameS(seq23, name);
696
697 /* get description */
698 ajDebug("Doing description\n");
699 ajFmtPrintS(&desc, "%%GC %4.1f Score %d ",
700 ((float)value->GCcount*100.0)/20.0,
701 value->score);
702 ajStrAppendS(&desc, ajSeqGetDescS(seq));
703 ajSeqAssignDescS(seq23, desc);
704
705 ajDebug("Write seq23\n");
706 ajSeqoutWriteSeq(seqout, seq23);
707
708 /* prepare sequence string for re-use */
709 ajStrSetClear(&subseq);
710 }
711 ajListIterDel(&iter);
712 }
713
714 ajStrDel(&tmpStr);
715 ajStrDel(&subseq);
716 ajStrDel(&source);
717 ajStrDel(&type);
718 ajStrDel(&name);
719 ajStrDel(&desc);
720 ajSeqDel(&seq23);
721
722 return;
723 }
724
725
726
727
728 /*
729 For the definitive guide to siRNA, see:
730 http://www.mpibpc.gwdg.de/abteilungen/100/105/sirna.html
731
732
733 The siRNA user guide (revised October 11, 2002)
734
735 Selection of siRNA duplexes from the target mRNA sequence
736
737 Using Drosophila melanogaster lysates (Tuschl et al. 1999), we have
738 systematically analyzed the silencing efficiency of siRNA duplexes as a
739 function of the length of the siRNAs, the length of the overhang and the
740 sequence in the overhang (Elbashir et al. 2001c). The most efficient
741 silencing was obtained with siRNA duplexes composed of 21-nt sense and
742 21-nt antisense strands, paired in a manner to have a 2-nt 3' overhang.
743 The sequence of the 2-nt 3' overhang makes a small contribution to the
744 specificity of target recognition restricted to the unpaired nucleotide
745 adjacent to the first base pair. 2'-Deoxynucleotides in the 3'
746 overhangs are as efficient as ribonucleotides, but are often cheaper to
747 synthesize and probably more nuclease resistant. We used to select
748 siRNA sequences with TT in the overhang.
749
750 The targeted region is selected from a given cDNA sequence beginning 50
751 to 100 nt downstream of the start codon. Initially, 5' or 3' UTRs and
752 regions nearby the start codon were avoided assuming that UTR-binding
753 proteins and/or translation initiation complexes may interfere with
754 binding of the siRNP or RISC endonuclease complex. More recently,
755 however, we have targeted 3'-UTRs and have not experienced any problems
756 in knocking down the targeted genes. In order to design a siRNA duplex,
757 we search for the 23-nt sequence motif AA(N19)TT (N, any nucleotide) and
758 select hits with approx. 50% G/C-content (30% to 70% has also worked in
759 our hands). If no suitable sequences are found, the search is extended
760 using the motif NA(N21). The sequence of the sense siRNA corresponds to
761 (N19)TT or N21 (position 3 to 23 of the 23-nt motif), respectively. In
762 the latter case, we convert the 3' end of the sense siRNA to TT. The
763 rationale for this sequence conversion is to generate a symmetric duplex
764 with respect to the sequence composition of the sense and antisense 3'
765 overhangs. The antisense siRNA is synthesized as the complement to
766 position 1 to 21 of the 23-nt motif. Because position 1 of the 23-nt
767 motif is not recognized sequence-specifically by the antisense siRNA,
768 the 3'-most nucleotide residue of the antisense siRNA, can be chosen
769 deliberately. However, the penultimate nucleotide of the antisense
770 siRNA (complementary to position 2 of the 23-nt motif) should always be
771 complementary to the targeted sequence. For simplifying chemical
772 synthesis, we always use TT. More recently, we preferentially select
773 siRNAs corresponding to the target motif NAR(N17)YNN, where R is purine
774 (A, G) and Y is pyrimidine (C, U). The respective 21-nt sense and
775 antisense siRNAs therefore begin with a purine nucleotide and can also
776 be expressed from pol III expression vectors without a change in
777 targeting site; expression of RNAs from pol III promoters is only
778 efficient when the first transcribed nucleotide is a purine.
779
780 We always design siRNAs with symmetric 3' TT overhangs, believing that
781 symmetric 3' overhangs help to ensure that the siRNPs are formed with
782 approximately equal ratios of sense and antisense target RNA-cleaving
783 siRNPs (Elbashir et al. 2001b; Elbashir et al. 2001c). Please note
784 that the modification of the overhang of the sense sequence of the siRNA
785 duplex is not expected to affect targeted mRNA recognition, as the
786 antisense siRNA strand guides target recognition. In summary, no matter
787 what you do to your overhangs, siRNAs should still function to a
788 reasonable extent. However, using TT in the 3' overhang will always
789 help your RNA synthesis company to let you know when you accidentally
790 order a siRNA sequences 3' to 5' rather than in the recommended format
791 of 5' to 3'. You may think this is funny, but it has happened quite a
792 lot.
793
794 Compared to antisense or ribozyme technology, the secondary structure of
795 the target mRNA does not appear to have a strong effect on silencing.
796 We say that, because we have already knocked-down more than 20 genes
797 using a single, essentially randomly chosen siRNA duplex (Harborth et
798 al. 2001). Only 3 siRNA duplexes have been ineffective so far. In one
799 or two other cases, we have found siRNAs to be inactive because the
800 targeting site contained a single-nucleotide polymorphism. We were also
801 able to knock-down two genes simultaneously (e.g. lamin A/C and NuMA)
802 by using equal concentrations of siRNA duplexes.
803
804 We recommend to blast-search (NCBI database) the selected siRNA sequence
805 against EST libraries to ensure that only one gene is targeted. In
806 addition, we also recommend to knock-down your gene with two independent
807 siRNA duplexes to control for specificity of the silencing effect. If
808 selected siRNA duplexes do not function for silencing, please check for
809 sequencing errors of the gene, polymorphisms, and whether your cell line
810 is really from the expected species. Our initial studies on the
811 specificity of target recognition by siRNA duplexes indicate that a
812 single point mutation located in the paired region of an siRNA duplex is
813 sufficient to abolish target mRNA degradation (Elbashir et al. 2001c).
814 Furthermore, it is unknown if targeting of a gene by two different siRNA
815 duplexes is more effective than using a single siRNA duplex. We think
816 that the amount of siRNA-associating proteins is limiting for silencing
817 rather than the target accessibility.
818
819 Sequences of siRNA duplexes used in our studies
820
821 The sequences of siRNA duplexes described in the Nature paper (Elbashir
822 et al. 2001a):
823
824 Lamin A/C
825 targeted region (cDNA): 5' AACTGGACTTCCAGAAGAACATC
826 sense siRNA: 5' CUGGACUUCCAGAAGAACAdTdT
827 antisense siRNA: 5' UGUUCUUCUGGAAGUCCAGdTdT
828
829
830 Vimentin
831 targeted region (cDNA): 5' AACTACATCGACAAGGTGCGCTT
832 sense siRNA: 5' CUACAUCGACAAGGUGCGCdTdT
833 antisense siRNA: 5' GCGCACCUUGUCGAUGUAGdTdT
834
835
836 NuMA
837 targeted region (cDNA): 5' AAGGCGTGGCAGGAGAAGTTCTT
838 sense siRNA: 5' GGCGUGGCAGGAGAAGUUCdTdT
839 antisense siRNA: 5' GAACUUCUCCUGCCACGCCdTdT
840
841
842 Lamin B1
843 targeted region (cDNA): 5' AACGCGCTTGGTAGAGGTGGATT
844 sense siRNA: 5' CGCGCUUGGUAGAGGUGGAdTdT
845 antisense siRNA: 5' UCCACCUCUACCAAGCGCGdTdT
846
847
848 GL2 Luciferase
849 targeted region (cDNA): 5' AACGTACGCGGAATACTTCGATT
850 sense siRNA: 5' CGUACGCGGAAUACUUCGAdTdT
851 antisense siRNA: 5' UCGAAGUAUUCCGCGUACGdTdT
852
853 ============================================================================
854
855 siRNA description from http://www.oligoengine.com/
856 30 October 2002
857
858 siRNA: Background and Applications RNA interference, or RNAi, is a
859 phenomenon in which double stranded RNA (dsRNA) effects silencing of the
860 expression of genes that are highly homologous to either of the RNA
861 strands in the duplex. Gene silencing in RNAi results from the
862 degradation of mRNA sequences, and the effect has been used to determine
863 the function of many genes in Drosophilia, C. elegans, and many plant
864 species.
865
866 The duration of knockdown by siRNA can typically last for 7-10 days, and
867 has been shown to transfer to daughter cells. Of further note, siRNAs
868 are effective at quantities much lower than alternative gene silencing
869 methodologies, including antisense and ribozyme based strategies.
870
871 Considerable interest has developed in RNAi lately among life science
872 professionals. The ability to quickly and easily generate
873 loss-of-function phenotypes in mammalian cells has spawned a new wave of
874 research endeavors; companies are also exploring the potential of siRNA
875 for use in drug development and gene-specific therapeutics.
876
877 Due to various mechanisms of antiviral response to long dsRNA, RNAi at
878 first proved more difficult to establish in mammalian species. Then,
879 Tuschl, Elbashir, and others discovered that RNAi can be elicited very
880 effectively by well-defined 21-base duplex RNAs. When these small
881 interfering RNA, or siRNA, are added in duplex form with a transfection
882 agent to mammalian cell cultures, the 21-base-pair RNA acts in concert
883 with cellular components to silence the gene with sequence homology to
884 one of the siRNA sequences.
885
886 Strategies for the design of effective siRNA sequences have been
887 recently documented, most notably by Sayda Elbashir, Thomas Tuschl, et
888 al. Like all of the design applications in the OligoEngine Platform,
889 our siRNA Design Tool transforms these strategies into an automated
890 process that improves accuracy, saves time, and provides valuable
891 feedback to the user.
892
893 In brief, the Tool analyzes a user's gene sequence and applies various
894 algorithms to generate "candidate" siRNA sequences that are 19
895 nucleotides in length. Users then select the 2-nt 3' overhang to order
896 two (sense and antisense) 21-nucleotide RNA oligos.
897
898 Use either:
899
900 1) Type the accession number of a GenBank mRNA file associated with your
901 gene of interest. (Hint: for best results, use a GenBank file with
902 complete CDS)
903
904 2) Copy a complete mRNA sequence from another file and use the "Paste
905 Sequence" button to quickly input your sequence.
906
907 Next, select the general parameters for siRNA design. These include:
908
909 GC Content: 50% is optimal, and the Tool uses a default range of 45-55%.
910 You can set parameters from 30-70%, the max range for potentially
911 effective siRNA.
912
913 Leader: The tool uses AA as its default, the widely accepted standard
914 for the dinucleotide leader. CA leaders can also be effective and may
915 be provide a broader range of options, especially when searching for a
916 sequence homologous to two related genes.
917
918 UTR Search: While we do not recommend siRNA candidates targeting 5' or
919 3' UTR regions, the Tool gives users the option to design in UTRs (if
920 UTR sequences are available).
921
922 ORF length: For paste-in sequences, the Tool first predicts the ORF to
923 ensure siRNA design within the coding region. Setting a minimum ORF
924 length helps to make an accurate prediction and account for "false" stop
925 codons inside the CDS.
926
927
928
929 Choose the 3' overhang: Click to select dTdT (default setting and
930 generally recommended choice) or UU. Contact us for other custom
931 overhang options if required.
932
933 BLAST your siRNA: click the BLAST button to check for sequence homology
934 against our local NCBI genome databases. When complete, the button will
935 turn green- click it to view the BLAST results in a separate screen
936 (reports on any siRNA you purchase are saved for your future review).
937
938
939 As mentioned, the siRNA Design Tool is based on guidelines developed by
940 leading researchers in the study of siRNA and RNAi: chiefly, Sayda
941 Elbashir, Thomas Tuschl, and their coworkers. These in turn were
942 developed based on systematic analyses of the structural and functional
943 requirements of siRNA duplexes in Drosophila cell free lysates.
944
945 They have systematically analyzed the silencing efficiency of siRNA
946 duplexes as a function of the length of the siRNAs, the length of the
947 3'-overhangs, and the sequence in the overhangs. More specifically, the
948 most effective siRNA were those where:
949
950 The duplexes are composed of 21-nucleotide sense and 21-nt antisense
951 strands, paired in a manner to have a 19-nt duplex region and a 2-nt
952 overhang at each 3'-terminus.
953
954 The sense strand of the 19-nt duplex is homologous to the target gene.
955
956 The 3'-overhangs contain either UU or dTdT dinucleotides.
957
958 The overhangs ensure that the sequence-specific endonuclease complexes
959 (siRNPs) are formed with approximately equal ratios of sense and
960 antisense target RNA cleaving siRNPs.
961
962 The 3'-overhang in the sense strand provides no contribution to
963 recognition as it is believed the antisense siRNA strand guides target
964 recognition. Therefore, 3'-overhang of the antisense sequences is
965 complementary to the target mRNA but the symmetrical 3'-overhang of the
966 sense siRNA oligo does not need to correspond to the mRNA. The use of
967 deoxythymidines in both 3'-overhangs may increase nuclease resistance,
968 although siRNA duplexes with either UU or dTdT overhangs have both
969 proven effective.
970
971 It is recommended that the targeted region of the mRNA is within the
972 ORF. Both the 5' and 3' UTRs and regions near the start codon are not
973 recommended for targeting as these may be richer in regulatory protein
974 binding sites; UTR-binding proteins and/or translation initiation
975 complexes may interfere with binding of the siRNP endonuclease complex.
976 (However, because these guidelines continue to evolve and be refined,
977 the Tool gives users the option to search for targets within the UTRs if
978 so desired.)
979
980 Taking these above characteristics and phenomena into account, the
981 general strategy for siRNA sequence design is summarized in the box at
982 right.
983
984 To date, successful silencing has been achieved using this strategy to
985 select the target sequence. Based on feedback from customers and
986 collaborators where transfection conditions have been optimized, siRNA
987 duplexes designed using this method will work 70-80% of the time. This
988 percentage is consistent with reports from Tuschl, Elbashir, and others
989
990
991 ============================================================================
992
993 http://www.xeragon.com/siRNA_support.html
994 Nov 2002
995
996 siRNA Target Sequence Design
997
998 Introduction
999
1000 Initial studies of mammalian RNAi suggest that the most efficient
1001 gene-silencing effect is achieved using double-stranded siRNA having a
1002 19-nucleotide complementary region and a 2-nucleotide 3' overhang at
1003 each end (see examples below). Current models of the RNAi mechanism
1004 suggest that the antisense siRNA strand recognizes the specific gene
1005 target.
1006
1007 In gene-specific RNAi, the open reading frame segment of the mRNA is
1008 usually targeted. The search for an appropriate target sequence should
1009 begin 50-100 nucleotides downstream of the start codon. To avoid
1010 interference from mRNA regulatory proteins, sequences in the 5' or 3'
1011 untranslated region or near the start codon should not be targeted.
1012
1013 Shortly after the first reports on the use of RNAi in mammalian cells, a
1014 set of simple rules for siRNA target site selection were developed
1015 (http://www.mpibpc.gwdg.de/abteilungen/100/105/sirna.html). One key
1016 feature in this early set of rules was the suggestion to look for
1017 specific sequence motifs to find the most appropriate siRNA targets.
1018 This original set of rules suggested that siRNA target sites with the
1019 sequence AA(N19)TT (Example 3) are the most appropriate target sequence.
1020 If an appropriate sequence could not be found, an AA(N21) (Example 1) or
1021 CA(N21) sequence motif could be considered. In these situations,
1022 because the sense strand does not appear to be involved in target
1023 recognition, the sequence of the sense strand can be synthesized with a
1024 dTdT regardless of the target sequence.
1025
1026 A number of laboratories have used these rules to create siRNA
1027 oligonucleotides that successfully silenced target gene expression.
1028 However, based on feedback from multiple laboratories, it does not
1029 appear that there is a strict need for the siRNA target sequence to
1030 begin with AA.
1031
1032 Based on the results of an extensive study of a single gene, in which
1033 approximately 70 different siRNA oligonucleotide target sequences were
1034 used for RNAi, we suggest a modified set of rules (see below). The
1035 results of this recent work, indicate that choosing a 21 nt region of
1036 the mRNA with a GC content as close as possible to 50% is a more
1037 important consideration than choosing a target sequence that begins with
1038 AA. Another key consideration in target selection is to avoid having
1039 more than three guanosines in a row, since poly G sequences can
1040 hyperstack and form agglomerates that potentially interfere with the
1041 siRNA silencing mechanism. It is recommended that these two rules be a
1042 primary consideration when designing and siRNA.
1043
1044 There are some advantages to using a target sequence that begins with
1045 AA. The use of a target sequence that begins with an AA allows dT
1046 residues to be used to create the 3'-end of the siRNA, this simplifies
1047 the synthesis and decreases the cost of the siRNA. Although duplexes
1048 synthesized with UU overhangs appear to work as well as dTdT overhangs,
1049 oligonucleotides with a dTdT overhang cost less and are potentially more
1050 resistant to nucleases. However, if the GC content is higher than 60%
1051 alternate regions of the coding region of the mRNA should be considered.
1052 Our suggestions for criteria to consider when designing an siRNA
1053 oligonucleotide are summarized in the "new relaxed set of rules" below.
1054 In addition, examples of siRNA oligonucleotides that start with an AA
1055 sequence as well as examples of siRNA oligonucleotides that do not are
1056 given.
1057
1058 Current Considerations for siRNA Design (19 complementary bases and 2
1059 bases overhang)
1060
1061 1.Choose a 21 (Example 1) or a 23 (Example 2 and 3) nt sequence in the
1062 coding region of the mRNA with a GC ratio as close to 50% as possible.
1063 Ideally the GC ratio will be between 45% and 55%. An siRNA with 60% GC
1064 content has worked in many cases, however when an siRNA with 70% GC
1065 content is used for RNAi typically a sharp decrease in the level of
1066 silencing is observed. Avoid regions within 50-100 nt of the AUG start
1067 codon or 50-100 nt of the termination codon.
1068
1069 2.Avoid more than three guanosines in a row. Poly G sequences can
1070 hyperstack and therefore form agglomerates that potentially interfere in
1071 the siRNA silencing mechanism.
1072
1073 3.Preferentially choose target sequences that start with two adenosines
1074 (Example 1). This will make synthesis easier, more economical, and
1075 create siRNA that is potentially more resistant to nucleases. When a
1076 sequence that starts with AA is used, siRNA with dTdT overhangs can be
1077 produced. If it is not possible to find a sequence that starts with AA
1078 and matches rules 1and 2, choose any 23 nt region of the coding sequence
1079 with a GC content between 45 and 55% that does not have more than three
1080 guanosines in a row (Example 3).
1081
1082 4.Ensure that your target sequence is not homologous to any other genes.
1083 It is strongly recommended that a BLAST search of the target sequence be
1084 performed to prevent the silencing of unwanted genes with a similar
1085 sequence.
1086
1087 5.Based on feedback from various customers, labelling the 3'-end of the
1088 sense strand gives the best results with respect to not interfering with
1089 the gene silencing mechanism of siRNA. Consult our custom siRNA page
1090 for available modifications.
1091
1092 When these rules are used for siRNA target sequence design RNAi
1093 effectively silences genes in more than 80% of cases. Current data
1094 indicate that there are regions of some mRNAs where gene silencing does
1095 not work. To help ensure that a given target gene is silenced, it is
1096 advised that at least two target sequences as far apart on the gene as
1097 possible be chosen.
1098
1099 This method does not take into account mRNA secondary structure. At
1100 present it does not appear that mRNA secondary structure has a
1101 significant impact on gene silencing.
1102
1103
1104 */
1105