1#!/usr/local/bin/bash
2
3set -ev
4
5export PERL_HASH_SEED=0
6
7if [ ! -e test.genome.fasta ]; then
8    gunzip -c test.genome.fasta.gz > test.genome.fasta
9fi
10
11
12if [ ! -e transcripts.gtf ]; then
13    gunzip -c transcripts.gtf.gz > transcripts.gtf
14fi
15
16if [ ! -e mini_Pfam-A.hmm ]; then
17    gunzip -c mini_Pfam-A.hmm.gz > mini_Pfam-A.hmm
18fi
19
20if [ ! -e mini_sprot.db.pep ]; then
21    gunzip -c mini_sprot.db.pep.gz > mini_sprot.db.pep
22fi
23
24
25## generate alignment gff3 formatted output
26../../util/gtf_to_alignment_gff3.pl transcripts.gtf > transcripts.gff3
27
28## generate transcripts fasta file
29../../util/gtf_genome_to_cdna_fasta.pl transcripts.gtf test.genome.fasta > transcripts.fasta
30
31## Extract the long ORFs
32../../TransDecoder.LongOrfs -t transcripts.fasta
33
34cmd=""
35## Predict likely ORFs
36if [ "$1" == "" ]; then   # always doing this now.
37    # just coding metrics
38    cmd="../../TransDecoder.Predict -t transcripts.fasta"
39
40else
41
42    # this is how I would have run blast and pfam but I'm using precomputed results for ease of demonstration.
43    #BLASTDB=/seq/RNASEQ/DBs/TRINOTATE_RESOURCES/TRINOTATE_V3/uniprot_sprot.pep
44    #PFAMDB=/seq/RNASEQ/DBs/TRINOTATE_RESOURCES/TRINOTATE_V3/Pfam-A.hmm
45    #
46    ## run blast
47    #blastp -query transcripts.fasta.transdecoder_dir/longest_orfs.pep -db $BLASTDB -max_target_seqs 1 -outfmt 6 -evalue 1e-5 > blastp.outfmt6
48
49    makeblastdb -in mini_sprot.db.pep -dbtype prot
50    blastp -query transcripts.fasta.transdecoder_dir/longest_orfs.pep -db mini_sprot.db.pep -max_target_seqs 1 -outfmt 6 -evalue 1e-5 > blastp.outfmt6
51
52    #
53    ## run pfam
54    #hmmscan --domtblout pfam.domtblout $PFAMDB transcripts.fasta.transdecoder_dir/longest_orfs.pep > pfam.log
55
56    hmmpress -f mini_Pfam-A.hmm
57    hmmscan --domtblout pfam.domtblout mini_Pfam-A.hmm transcripts.fasta.transdecoder_dir/longest_orfs.pep
58
59    ## use pfam and blast results:
60    cmd="../../TransDecoder.Predict  -t transcripts.fasta --retain_pfam_hits pfam.domtblout --retain_blastp_hits blastp.outfmt6   -v"
61
62fi
63
64eval $cmd
65
66
67## convert to genome coordinates
68../../util/cdna_alignment_orf_to_genome_orf.pl transcripts.fasta.transdecoder.gff3 transcripts.gff3 transcripts.fasta > transcripts.fasta.transdecoder.genome.gff3
69
70
71## make bed files for viewing with GenomeView
72
73# covert cufflinks gtf to bed
74../../util/gtf_to_bed.pl transcripts.gtf > transcripts.bed
75
76# convert the genome-based gene-gff3 file to bed
77../../util/gff3_file_to_bed.pl transcripts.fasta.transdecoder.genome.gff3 > transcripts.fasta.transdecoder.genome.bed
78
79
80# ensure no fatal problems w/ pep file
81../../util/fasta_prot_checker.pl transcripts.fasta.transdecoder.pep
82
83# Done!  Coding region genome annotations provided as: transcripts.fasta.transdecoder.genome.\*
84
85
86exit 0
87