1#!/usr/local/bin/bash 2 3set -ev 4 5export PERL_HASH_SEED=0 6 7if [ ! -e test.genome.fasta ]; then 8 gunzip -c test.genome.fasta.gz > test.genome.fasta 9fi 10 11 12if [ ! -e transcripts.gtf ]; then 13 gunzip -c transcripts.gtf.gz > transcripts.gtf 14fi 15 16if [ ! -e mini_Pfam-A.hmm ]; then 17 gunzip -c mini_Pfam-A.hmm.gz > mini_Pfam-A.hmm 18fi 19 20if [ ! -e mini_sprot.db.pep ]; then 21 gunzip -c mini_sprot.db.pep.gz > mini_sprot.db.pep 22fi 23 24 25## generate alignment gff3 formatted output 26../../util/gtf_to_alignment_gff3.pl transcripts.gtf > transcripts.gff3 27 28## generate transcripts fasta file 29../../util/gtf_genome_to_cdna_fasta.pl transcripts.gtf test.genome.fasta > transcripts.fasta 30 31## Extract the long ORFs 32../../TransDecoder.LongOrfs -t transcripts.fasta 33 34cmd="" 35## Predict likely ORFs 36if [ "$1" == "" ]; then # always doing this now. 37 # just coding metrics 38 cmd="../../TransDecoder.Predict -t transcripts.fasta" 39 40else 41 42 # this is how I would have run blast and pfam but I'm using precomputed results for ease of demonstration. 43 #BLASTDB=/seq/RNASEQ/DBs/TRINOTATE_RESOURCES/TRINOTATE_V3/uniprot_sprot.pep 44 #PFAMDB=/seq/RNASEQ/DBs/TRINOTATE_RESOURCES/TRINOTATE_V3/Pfam-A.hmm 45 # 46 ## run blast 47 #blastp -query transcripts.fasta.transdecoder_dir/longest_orfs.pep -db $BLASTDB -max_target_seqs 1 -outfmt 6 -evalue 1e-5 > blastp.outfmt6 48 49 makeblastdb -in mini_sprot.db.pep -dbtype prot 50 blastp -query transcripts.fasta.transdecoder_dir/longest_orfs.pep -db mini_sprot.db.pep -max_target_seqs 1 -outfmt 6 -evalue 1e-5 > blastp.outfmt6 51 52 # 53 ## run pfam 54 #hmmscan --domtblout pfam.domtblout $PFAMDB transcripts.fasta.transdecoder_dir/longest_orfs.pep > pfam.log 55 56 hmmpress -f mini_Pfam-A.hmm 57 hmmscan --domtblout pfam.domtblout mini_Pfam-A.hmm transcripts.fasta.transdecoder_dir/longest_orfs.pep 58 59 ## use pfam and blast results: 60 cmd="../../TransDecoder.Predict -t transcripts.fasta --retain_pfam_hits pfam.domtblout --retain_blastp_hits blastp.outfmt6 -v" 61 62fi 63 64eval $cmd 65 66 67## convert to genome coordinates 68../../util/cdna_alignment_orf_to_genome_orf.pl transcripts.fasta.transdecoder.gff3 transcripts.gff3 transcripts.fasta > transcripts.fasta.transdecoder.genome.gff3 69 70 71## make bed files for viewing with GenomeView 72 73# covert cufflinks gtf to bed 74../../util/gtf_to_bed.pl transcripts.gtf > transcripts.bed 75 76# convert the genome-based gene-gff3 file to bed 77../../util/gff3_file_to_bed.pl transcripts.fasta.transdecoder.genome.gff3 > transcripts.fasta.transdecoder.genome.bed 78 79 80# ensure no fatal problems w/ pep file 81../../util/fasta_prot_checker.pl transcripts.fasta.transdecoder.pep 82 83# Done! Coding region genome annotations provided as: transcripts.fasta.transdecoder.genome.\* 84 85 86exit 0 87