1version 1.0 2 3import "../cnv_common_tasks.wdl" as CNVTasks 4import "https://raw.githubusercontent.com/broadinstitute/gatk-sv/v0.10-beta/wdl/AnnotateChromosome.wdl" as AnnotateVcf 5 6workflow JointCallExomeCNVs { 7 8 ################################## 9 #### required basic arguments #### 10 ################################## 11 input { 12 Int num_samples_per_scatter_block 13 File intervals 14 File? blacklist_intervals 15 16 File contig_ploidy_calls_tar_path_list 17 File gcnv_calls_tars_path_list 18 File genotyped_intervals_vcf_indexes_path_list 19 File genotyped_intervals_vcfs_path_list 20 File genotyped_segments_vcf_indexes_path_list 21 File genotyped_segments_vcfs_path_list 22 23 #qc arguments 24 Int maximum_number_events 25 Int maximum_number_pass_events 26 27 Array[File] gcnv_model_tars 28 Array[File] calling_configs 29 Array[File] denoising_configs 30 Array[File] gcnvkernel_version 31 Array[File] sharded_interval_lists 32 Array[String]? allosomal_contigs 33 Int ref_copy_number_autosomal_contigs 34 File ref_fasta_dict 35 File ref_fasta_fai 36 File ref_fasta 37 String x_contig_name 38 File protein_coding_gtf 39 File linc_rna_gtf 40 File promoter_bed 41 File noncoding_bed 42 String gatk_docker 43 String gatk_docker_clustering 44 String gatk_docker_qual_calc 45 String sv_pipeline_docker 46 } 47 48 #we do these as FoFNs for Terra compatibility 49 Array[File] contig_ploidy_calls_tars = read_lines(contig_ploidy_calls_tar_path_list) 50 Array[File] segments_vcfs = read_lines(genotyped_segments_vcfs_path_list) 51 Array[File] segments_vcf_indexes = read_lines(genotyped_segments_vcf_indexes_path_list) 52 Array[File] intervals_vcfs = read_lines(genotyped_intervals_vcfs_path_list) 53 Array[File] intervals_vcf_indexes = read_lines(genotyped_intervals_vcf_indexes_path_list) 54 Array[Array[File]] gcnv_calls_tars = read_tsv(gcnv_calls_tars_path_list) 55 Array[Array[File]] call_tars_sample_by_shard = transpose(gcnv_calls_tars) 56 57 #create a ped file to use for allosome copy number (e.g. XX, XY) 58 call MakePedFile { 59 input: 60 contig_ploidy_calls_tar = read_lines(contig_ploidy_calls_tar_path_list), 61 x_contig_name = x_contig_name 62 } 63 64 call CNVTasks.SplitInputArray as SplitSegmentsVcfsList { 65 input: 66 input_array = segments_vcfs, 67 num_inputs_in_scatter_block = num_samples_per_scatter_block, 68 gatk_docker = gatk_docker 69 } 70 71 call CNVTasks.SplitInputArray as SplitSegmentsIndexesList { 72 input: 73 input_array = segments_vcf_indexes, 74 num_inputs_in_scatter_block = num_samples_per_scatter_block, 75 gatk_docker = gatk_docker 76 } 77 78 Array[Array[String]] split_segments = SplitSegmentsVcfsList.split_array 79 Array[Array[String]] split_segments_indexes = SplitSegmentsIndexesList.split_array 80 81 #for more than num_samples_per_scatter_block, do an intermediate combine first 82 if (length(split_segments) > 1) { 83 scatter (subarray_index in range(length(split_segments))) { 84 call JointSegmentation as ScatterJointSegmentation { 85 input: 86 segments_vcfs = split_segments[subarray_index], 87 segments_vcf_indexes = split_segments_indexes[subarray_index], 88 ped_file = MakePedFile.ped_file, 89 ref_fasta = ref_fasta, 90 ref_fasta_fai = ref_fasta_fai, 91 ref_fasta_dict = ref_fasta_dict, 92 gatk_docker = gatk_docker_clustering, 93 model_intervals = intervals 94 } 95 } 96 } 97 98 #refine breakpoints over all samples 99 call JointSegmentation as GatherJointSegmentation { 100 input: 101 segments_vcfs = select_first([ScatterJointSegmentation.clustered_vcf, segments_vcfs]), 102 segments_vcf_indexes = select_first([ScatterJointSegmentation.clustered_vcf_index, segments_vcfs]), 103 ped_file = MakePedFile.ped_file, 104 ref_fasta = ref_fasta, 105 ref_fasta_fai = ref_fasta_fai, 106 ref_fasta_dict = ref_fasta_dict, 107 gatk_docker = gatk_docker_clustering, 108 model_intervals = intervals 109 } 110 111 #recalculate each sample's quality scores based on new breakpoints and filter low QS or high AF events; 112 #exclude samples with too many events 113 scatter (scatter_index in range(length(segments_vcfs))) { 114 call CNVTasks.PostprocessGermlineCNVCalls as RecalcQual { 115 input: 116 entity_id = sub(sub(basename(intervals_vcfs[scatter_index]), ".vcf.gz", ""), "intervals_output_", ""), 117 gcnv_calls_tars = call_tars_sample_by_shard[scatter_index], 118 gcnv_model_tars = gcnv_model_tars, 119 calling_configs = calling_configs, 120 denoising_configs = denoising_configs, 121 gcnvkernel_version = gcnvkernel_version, 122 sharded_interval_lists = sharded_interval_lists, 123 contig_ploidy_calls_tar = read_lines(contig_ploidy_calls_tar_path_list)[0], #this is always a list of one tar 124 allosomal_contigs = allosomal_contigs, 125 ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs, 126 sample_index = scatter_index, 127 intervals_vcf = intervals_vcfs[scatter_index], 128 intervals_vcf_index = intervals_vcf_indexes[scatter_index], 129 clustered_vcf = GatherJointSegmentation.clustered_vcf, 130 clustered_vcf_index = GatherJointSegmentation.clustered_vcf_index, 131 gatk_docker = gatk_docker_qual_calc 132 } 133 call CNVTasks.CollectSampleQualityMetrics as SampleQC { 134 input: 135 genotyped_segments_vcf = RecalcQual.genotyped_segments_vcf, 136 entity_id = sub(sub(basename(intervals_vcfs[scatter_index]), ".vcf.gz", ""), "intervals_output_", ""), 137 maximum_number_events = maximum_number_events, 138 maximum_number_pass_events = maximum_number_pass_events 139 } 140 } 141 142 #only put samples that passed QC into the combined VCF 143 scatter(idx in range(length(RecalcQual.genotyped_segments_vcf))) { 144 if (SampleQC.qc_status_string[idx] == "PASS") { 145 String subset = RecalcQual.genotyped_segments_vcf[idx] 146 String subset_indexes = RecalcQual.genotyped_segments_vcf_index[idx] 147 } 148 if (SampleQC.qc_status_string[idx] != "PASS") { 149 String failed = sub(sub(basename(RecalcQual.genotyped_segments_vcf[idx]), ".vcf.gz", ""), "segments_output_", "") 150 } 151 } 152 Array[String] subset_arr = select_all(subset) 153 Array[String] subset_index_arr = select_all(subset_indexes) 154 Array[String] failed_samples = select_all(failed) 155 156 call FastCombine { 157 input: 158 input_vcfs = subset_arr, 159 input_vcf_indexes = subset_index_arr, 160 sv_pipeline_docker = sv_pipeline_docker 161 } 162 163 # this annotates any vcf -- for exomes we can do all chromosomes at once 164 call AnnotateVcf.AnnotateChromosome as Annotate { 165 input: 166 prefix = "combined.annotated", 167 vcf = FastCombine.combined_vcf, 168 protein_coding_gtf = protein_coding_gtf, 169 linc_rna_gtf = linc_rna_gtf, 170 promoter_bed = promoter_bed, 171 noncoding_bed = noncoding_bed, 172 sv_pipeline_docker = sv_pipeline_docker 173 } 174 175 output { 176 File combined_calls = FastCombine.combined_vcf 177 File combined_calls_index = FastCombine.combined_vcf_index 178 File annotated_vcf = Annotate.annotated_vcf 179 File annotated_vcf_index = Annotate.annotated_vcf_idx 180 Array[String] sample_qc_status_strings = SampleQC.qc_status_string 181 Array[String] failed_samples = failed_samples 182 } 183} 184 185task MakePedFile { 186 input { 187 Array[File] contig_ploidy_calls_tar 188 String x_contig_name 189 } 190 191 command <<< 192 set -e 193 194 while read tar 195 do 196 mkdir callsDir 197 tar -xf $tar -C callsDir 198 199 for sample in $(ls -d -1 callsDir/SAMPLE*) 200 do 201 sample_name=$(cat $sample/sample_name.txt) 202 x_ploidy=$(grep ^~{x_contig_name} $sample/contig_ploidy.tsv | cut -f 2) 203 [[ -z "$x_ploidy" ]] && { echo "Chromosome ~{x_contig_name} ploidy call not found for sample " $sample_name; exit 1; } 204 printf "%s\t%s\t0\t0\t%s\t0\n" $sample_name $sample_name $x_ploidy >> cohort.ped 205 done 206 rm -rf callsDir 207 done < ~{write_lines(contig_ploidy_calls_tar)} 208 >>> 209 210 output { 211 File ped_file = "cohort.ped" 212 } 213 214 runtime { 215 docker: "gatksv/sv-base-mini:b3af2e3" 216 memory: "3000 MB" 217 disks: "local-disk 100 SSD" 218 cpu: 1 219 preemptible: 2 220 } 221} 222 223task JointSegmentation { 224 input { 225 Array[File] segments_vcfs 226 Array[File] segments_vcf_indexes 227 File ped_file 228 File ref_fasta_dict 229 File ref_fasta_fai 230 File ref_fasta 231 File model_intervals 232 233 # Runtime parameters 234 String gatk_docker 235 Int? mem_gb 236 Int? disk_space_gb 237 Boolean use_ssd = false 238 Int? cpu 239 Int? preemptible_attempts 240 } 241 242 parameter_meta { 243 segments_vcfs: {localization_optional: true} 244 segments_vcf_indexes: {localization_optional: true} 245 } 246 247 Int machine_mem_mb = select_first([mem_gb, 2]) * 1000 248 Int command_mem_mb = machine_mem_mb - 500 249 250 #NOTE: output has to be gzipped to be read in by pyvcf in the next step 251 command <<< 252 set -e 253 gatk --java-options "-Xmx~{command_mem_mb}m" JointGermlineCNVSegmentation \ 254 -R ~{ref_fasta} -O clustered.vcf.gz -V ~{sep=' -V ' segments_vcfs} --model-call-intervals ~{model_intervals} -ped ~{ped_file} 255 >>> 256 257 output { 258 File clustered_vcf = "clustered.vcf.gz" 259 File clustered_vcf_index = "clustered.vcf.gz.tbi" 260 } 261 262 runtime { 263 docker: gatk_docker 264 memory: machine_mem_mb + " MB" 265 disks: "local-disk " + select_first([disk_space_gb, 40]) + if use_ssd then " SSD" else " HDD" 266 cpu: select_first([cpu, 1]) 267 preemptible: select_first([preemptible_attempts, 2]) 268 } 269} 270 271task FastCombine { 272 input { 273 Array[File] input_vcfs 274 Array[File] input_vcf_indexes 275 String sv_pipeline_docker 276 Int? preemptible_tries 277 Int? disk_size 278 Float? mem_gb 279 } 280 281 command <<< 282 #bcftools gets pissy if the indexes look older than their VCFs 283 index_fofn=~{write_lines(input_vcf_indexes)} 284 while read index; do touch $index; done < $index_fofn 285 286 bcftools merge -l ~{write_lines(input_vcfs)} -o combined.vcf.gz -O z --threads 4 -m all -0 287 288 tabix combined.vcf.gz 289 >>> 290 291 output { 292 File combined_vcf = "combined.vcf.gz" 293 File combined_vcf_index = "combined.vcf.gz.tbi" 294 } 295 296 runtime { 297 docker: sv_pipeline_docker 298 preemptible: select_first([preemptible_tries, 2]) 299 memory: select_first([mem_gb, 3.5]) + " GiB" 300 cpu: "1" 301 disks: "local-disk " + select_first([disk_size, 50]) + " HDD" 302 } 303}