1#!/usr/bin/env bash
2
# Creates a set of files that map records between GENCODE and RefSeq.
# The mappings are pulled directly from the Ensembl database.
# Currently not used by Funcotator.
6
# Databases selected with `use` when the query is run, one per genome build.
hg19db="homo_sapiens_core_75_37"
hg38db="homo_sapiens_core_90_38"

# Common stem and extension shared by both output files.
outFileBaseName="gencode_xrefseq"
outExt=".tsv"

# Output file names: <stem>_v<release>_<assembly>.<build><ext>
printf -v hg19FileName '%s_v75_37.hg19%s' "${outFileBaseName}" "${outExt}"
printf -v hg38FileName '%s_v90_38.hg38%s' "${outFileBaseName}" "${outExt}"
15
16################################################################################
17
18
# Build the SQL text that is sent to the DB and keep it in `query`.
#
# The statement joins two sub-selects on the versioned transcript id
# (stable_id.version):
#   mrna - transcripts with a 'RefSeq_mRNA' cross-reference
#   prot - transcripts whose translation has a 'RefSeq_peptide' cross-reference
#
# Notes on the here-document:
#   * `<<-` strips leading TABs only, so the indentation below must stay tabs.
#   * The delimiter is quoted so the shell performs no expansion in the SQL.
#   * The command substitution drops the trailing newline.
query=$(cat <<-'ENDOFQUERYINPUT'
SELECT mrna.transcript_id as transcript_id, mRNA_id, prot_acc FROM 
	(
		SELECT CONCAT(transcript.stable_id, '.', transcript.version) AS transcript_id, xref.display_label AS mRNA_id  
			FROM transcript, object_xref, xref, external_db
			WHERE 
					transcript.transcript_id = object_xref.ensembl_id AND 
					object_xref.ensembl_object_type = 'Transcript'    AND 
					object_xref.xref_id = xref.xref_id                AND 
					xref.external_db_id = external_db.external_db_id  AND 
					external_db.db_name = 'RefSeq_mRNA'
	) AS mrna
	JOIN
	(
		SELECT CONCAT(transcript.stable_id, '.', transcript.version) AS transcript_id, xref.display_label AS prot_acc 
			FROM translation, transcript, object_xref, xref,external_db
			WHERE
				(
					transcript.transcript_id = translation.transcript_id  AND 
					translation.translation_id = object_xref.ensembl_id  AND 
					object_xref.ensembl_object_type = 'Translation'      AND 
					object_xref.xref_id = xref.xref_id                   AND 
					xref.external_db_id = external_db.external_db_id     AND 
					external_db.db_name = 'RefSeq_peptide'
				)
	) AS prot
	ON mrna.transcript_id = prot.transcript_id 
;
ENDOFQUERYINPUT
)
49
# Make a pipeline fail when ANY stage fails. Without this the pipeline's
# status is that of the final `sort`, so a failed mysql call goes unnoticed
# and leaves a header-only output file behind.
set -o pipefail

#######################################
# Runs the cross-reference query against one database and writes the result
# as a tab-separated file with a header row.
# Globals:   query (read) - SQL text to execute
# Arguments: $1 - name of the database to `use`
#            $2 - path of the output file (overwritten)
# Outputs:   header plus sorted result rows in $2; `time` report on stderr
# Returns:   0 on success; non-zero if the header cannot be written or any
#            stage of the mysql | tail | sort pipeline fails
#######################################
fetch_gencode_xrefseq() {
  local db_name=$1
  local out_file=$2

  printf 'transcript_id\tmRNA_id\tprot_acc\n' > "${out_file}" || return

  # `tail -n +2` drops the first output line (the column-name row the mysql
  # client emits in batch mode); our own header was written above.
  # NOTE(review): `sort -n` is kept unchanged to preserve the existing output
  # order. The key is presumably a non-numeric id (e.g. ENST...), in which
  # case -n ranks every key equal and ordering falls back to whole-line,
  # locale-dependent comparison - confirm whether `LC_ALL=C sort` is wanted.
  time mysql -u anonymous -h ensembldb.ensembl.org -e "use ${db_name};${query}" \
    | tail -n +2 \
    | sort -n -k1 >> "${out_file}"
}

echo "Getting HG19 gencode <=> refseq..."
if ! fetch_gencode_xrefseq "${hg19db}" "${hg19FileName}"; then
  echo "ERROR: HG19 query failed; ${hg19FileName} may be incomplete." >&2
  exit 1
fi

echo "Getting HG38 gencode <=> refseq..."
if ! fetch_gencode_xrefseq "${hg38db}" "${hg38FileName}"; then
  echo "ERROR: HG38 query failed; ${hg38FileName} may be incomplete." >&2
  exit 1
fi

echo 'Done!'
59
60
61