1version 1.0
2
3import "../cnv_common_tasks.wdl" as CNVTasks
4import "https://raw.githubusercontent.com/broadinstitute/gatk-sv/v0.10-beta/wdl/AnnotateChromosome.wdl" as AnnotateVcf
5
# Joint-calls exome germline CNVs across a cohort:
#  1. builds a ped file (allosome ploidy) from gCNV contig-ploidy calls,
#  2. clusters per-sample segment VCFs (scattered, then gathered) with JointGermlineCNVSegmentation,
#  3. recalculates per-sample qualities against the joint breakpoints,
#  4. QC-filters samples, merges passing samples into one VCF, and annotates it.
workflow JointCallExomeCNVs {

    ##################################
    #### required basic arguments ####
    ##################################
    input {
      Int num_samples_per_scatter_block
      File intervals
      File? blacklist_intervals

      File contig_ploidy_calls_tar_path_list
      File gcnv_calls_tars_path_list
      File genotyped_intervals_vcf_indexes_path_list
      File genotyped_intervals_vcfs_path_list
      File genotyped_segments_vcf_indexes_path_list
      File genotyped_segments_vcfs_path_list

      #qc arguments
      Int maximum_number_events
      Int maximum_number_pass_events

      Array[File] gcnv_model_tars
      Array[File] calling_configs
      Array[File] denoising_configs
      Array[File] gcnvkernel_version
      Array[File] sharded_interval_lists
      Array[String]? allosomal_contigs
      Int ref_copy_number_autosomal_contigs
      File ref_fasta_dict
      File ref_fasta_fai
      File ref_fasta
      String x_contig_name
      File  protein_coding_gtf
      File  linc_rna_gtf
      File  promoter_bed
      File  noncoding_bed
      String gatk_docker
      String gatk_docker_clustering
      String gatk_docker_qual_calc
      String sv_pipeline_docker
    }

    #we do these as FoFNs for Terra compatibility
    Array[File] contig_ploidy_calls_tars = read_lines(contig_ploidy_calls_tar_path_list)
    Array[File] segments_vcfs = read_lines(genotyped_segments_vcfs_path_list)
    Array[File] segments_vcf_indexes = read_lines(genotyped_segments_vcf_indexes_path_list)
    Array[File] intervals_vcfs = read_lines(genotyped_intervals_vcfs_path_list)
    Array[File] intervals_vcf_indexes = read_lines(genotyped_intervals_vcf_indexes_path_list)
    Array[Array[File]] gcnv_calls_tars = read_tsv(gcnv_calls_tars_path_list)
    Array[Array[File]] call_tars_sample_by_shard = transpose(gcnv_calls_tars)

    #create a ped file to use for allosome copy number (e.g. XX, XY)
    call MakePedFile {
      input:
        #reuse the FoFN parsed above rather than re-reading the list file
        contig_ploidy_calls_tar = contig_ploidy_calls_tars,
        x_contig_name = x_contig_name
    }

    call CNVTasks.SplitInputArray as SplitSegmentsVcfsList {
        input:
            input_array = segments_vcfs,
            num_inputs_in_scatter_block = num_samples_per_scatter_block,
            gatk_docker = gatk_docker
    }

    call CNVTasks.SplitInputArray as SplitSegmentsIndexesList {
        input:
            input_array = segments_vcf_indexes,
            num_inputs_in_scatter_block = num_samples_per_scatter_block,
            gatk_docker = gatk_docker
    }

    Array[Array[String]] split_segments = SplitSegmentsVcfsList.split_array
    Array[Array[String]] split_segments_indexes = SplitSegmentsIndexesList.split_array

    #for more than num_samples_per_scatter_block, do an intermediate combine first
    if (length(split_segments) > 1) {
      scatter (subarray_index in range(length(split_segments))) {
        call JointSegmentation as ScatterJointSegmentation {
          input:
            segments_vcfs = split_segments[subarray_index],
            segments_vcf_indexes = split_segments_indexes[subarray_index],
            ped_file = MakePedFile.ped_file,
            ref_fasta = ref_fasta,
            ref_fasta_fai = ref_fasta_fai,
            ref_fasta_dict = ref_fasta_dict,
            gatk_docker = gatk_docker_clustering,
            model_intervals = intervals
        }
      }
    }

    #refine breakpoints over all samples
    call JointSegmentation as GatherJointSegmentation {
      input:
        segments_vcfs = select_first([ScatterJointSegmentation.clustered_vcf, segments_vcfs]),
        #bugfix: the fallback here must be the index list, not the VCF list
        segments_vcf_indexes = select_first([ScatterJointSegmentation.clustered_vcf_index, segments_vcf_indexes]),
        ped_file = MakePedFile.ped_file,
        ref_fasta = ref_fasta,
        ref_fasta_fai = ref_fasta_fai,
        ref_fasta_dict = ref_fasta_dict,
        gatk_docker = gatk_docker_clustering,
        model_intervals = intervals
    }

    #recalculate each sample's quality scores based on new breakpoints and filter low QS or high AF events;
    #exclude samples with too many events
    scatter (scatter_index in range(length(segments_vcfs))) {
      call CNVTasks.PostprocessGermlineCNVCalls as RecalcQual {
        input:
              #sample name is recovered from the intervals VCF filename: strip ".vcf.gz" and the "intervals_output_" prefix
              entity_id = sub(sub(basename(intervals_vcfs[scatter_index]), ".vcf.gz", ""), "intervals_output_", ""),
              gcnv_calls_tars = call_tars_sample_by_shard[scatter_index],
              gcnv_model_tars = gcnv_model_tars,
              calling_configs = calling_configs,
              denoising_configs = denoising_configs,
              gcnvkernel_version = gcnvkernel_version,
              sharded_interval_lists = sharded_interval_lists,
              contig_ploidy_calls_tar = contig_ploidy_calls_tars[0],  #this is always a list of one tar
              allosomal_contigs = allosomal_contigs,
              ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs,
              sample_index = scatter_index,
              intervals_vcf = intervals_vcfs[scatter_index],
              intervals_vcf_index = intervals_vcf_indexes[scatter_index],
              clustered_vcf = GatherJointSegmentation.clustered_vcf,
              clustered_vcf_index = GatherJointSegmentation.clustered_vcf_index,
              gatk_docker = gatk_docker_qual_calc
      }
       call CNVTasks.CollectSampleQualityMetrics as SampleQC {
        input:
          genotyped_segments_vcf = RecalcQual.genotyped_segments_vcf,
          entity_id = sub(sub(basename(intervals_vcfs[scatter_index]), ".vcf.gz", ""), "intervals_output_", ""),
          maximum_number_events = maximum_number_events,
          maximum_number_pass_events = maximum_number_pass_events
      }
    }

    #only put samples that passed QC into the combined VCF
    scatter(idx in range(length(RecalcQual.genotyped_segments_vcf))) {
      if (SampleQC.qc_status_string[idx] == "PASS") {
        String subset = RecalcQual.genotyped_segments_vcf[idx]
        String subset_indexes = RecalcQual.genotyped_segments_vcf_index[idx]
      }
      if (SampleQC.qc_status_string[idx] != "PASS") {
        String failed = sub(sub(basename(RecalcQual.genotyped_segments_vcf[idx]), ".vcf.gz", ""), "segments_output_", "")
      }
    }
    Array[String] subset_arr = select_all(subset)
    Array[String] subset_index_arr = select_all(subset_indexes)
    #named distinctly from the "failed_samples" output below to avoid a duplicate-declaration clash
    Array[String] failed_sample_names = select_all(failed)

    call FastCombine {
      input:
        input_vcfs = subset_arr,
        input_vcf_indexes = subset_index_arr,
        sv_pipeline_docker = sv_pipeline_docker
    }

    # this annotates any vcf -- for exomes we can do all chromosomes at once
    call AnnotateVcf.AnnotateChromosome as Annotate {
        input:
          prefix = "combined.annotated",
          vcf = FastCombine.combined_vcf,
          protein_coding_gtf = protein_coding_gtf,
          linc_rna_gtf = linc_rna_gtf,
          promoter_bed = promoter_bed,
          noncoding_bed = noncoding_bed,
          sv_pipeline_docker = sv_pipeline_docker
    }

    output {
      File combined_calls = FastCombine.combined_vcf
      File combined_calls_index = FastCombine.combined_vcf_index
      File annotated_vcf = Annotate.annotated_vcf
      File annotated_vcf_index = Annotate.annotated_vcf_idx
      Array[String] sample_qc_status_strings = SampleQC.qc_status_string
      Array[String] failed_samples = failed_sample_names
    }
}
184
# Builds a six-column ped file ("cohort.ped") from gCNV contig-ploidy call tarballs.
# Each sample contributes one row: sample name (used as both family and individual ID),
# zeroed parental IDs, the called ploidy of the X contig in the "sex" column, and a
# zero phenotype column. Fails fast if any sample lacks an X-contig ploidy call.
task MakePedFile {
  input {
    # Tarballs of per-sample contig-ploidy calls; each contains SAMPLE* subdirectories
    # holding sample_name.txt and contig_ploidy.tsv
    Array[File] contig_ploidy_calls_tar
    # Contig name to look up for the sex column (e.g. "chrX" or "X")
    String x_contig_name
  }

  command <<<
    set -e

    # Unpack each tarball into a scratch dir, harvest per-sample rows, then clean up
    # so the next tarball starts fresh
    while read tar
    do
      mkdir callsDir
      tar -xf $tar -C callsDir

      for sample in $(ls -d -1 callsDir/SAMPLE*)
      do
        sample_name=$(cat $sample/sample_name.txt)
        # contig_ploidy.tsv: column 1 is the contig name, column 2 the called ploidy
        x_ploidy=$(grep ^~{x_contig_name} $sample/contig_ploidy.tsv | cut -f 2)
        [[ -z "$x_ploidy" ]] && { echo "Chromosome ~{x_contig_name} ploidy call not found for sample " $sample_name; exit 1; }
        # ped columns: family ID, individual ID, father, mother, sex (X ploidy), phenotype
        printf "%s\t%s\t0\t0\t%s\t0\n" $sample_name $sample_name $x_ploidy >> cohort.ped
      done
      rm -rf callsDir
    done < ~{write_lines(contig_ploidy_calls_tar)}
    >>>

    output {
      File ped_file = "cohort.ped"
    }

    runtime {
      docker: "gatksv/sv-base-mini:b3af2e3"
      memory: "3000 MB"
      disks: "local-disk 100 SSD"
      cpu: 1
      preemptible: 2
    }
}
222
# Clusters per-sample gCNV segment VCFs into a single cohort-level VCF by running
# GATK JointGermlineCNVSegmentation against the model call intervals and ped file.
task JointSegmentation {
  input {
    Array[File] segments_vcfs
    Array[File] segments_vcf_indexes
    File ped_file
    File ref_fasta_dict
    File ref_fasta_fai
    File ref_fasta
    File model_intervals

    # Runtime parameters
    String gatk_docker
    Int? mem_gb
    Int? disk_space_gb
    Boolean use_ssd = false
    Int? cpu
    Int? preemptible_attempts
  }

  # VCFs are streamed by the tool, so localization may be skipped
  parameter_meta {
    segments_vcfs: {localization_optional: true}
    segments_vcf_indexes: {localization_optional: true}
  }

  # Hold back 500 MB of the VM's memory for non-JVM overhead
  Int vm_mem_mb = select_first([mem_gb, 2]) * 1000
  Int jvm_mem_mb = vm_mem_mb - 500
  Int disk_gb = select_first([disk_space_gb, 40])
  String disk_type = if use_ssd then " SSD" else " HDD"

  #NOTE: output has to be gzipped to be read in by pyvcf in the next step
  command <<<
    set -e
    gatk --java-options "-Xmx~{jvm_mem_mb}m" JointGermlineCNVSegmentation \
    -R ~{ref_fasta} -O clustered.vcf.gz -V ~{sep=' -V ' segments_vcfs} --model-call-intervals ~{model_intervals} -ped ~{ped_file}
  >>>

  output {
    File clustered_vcf = "clustered.vcf.gz"
    File clustered_vcf_index = "clustered.vcf.gz.tbi"
  }

  runtime {
    docker: gatk_docker
    memory: vm_mem_mb + " MB"
    disks: "local-disk " + disk_gb + disk_type
    cpu: select_first([cpu, 1])
    preemptible: select_first([preemptible_attempts, 2])
  }
}
270
# Merges many single-sample bgzipped VCFs into one multi-sample VCF with
# "bcftools merge" and tabix-indexes the result.
task FastCombine {
  input {
    Array[File] input_vcfs          # bgzipped VCFs to merge (one per sample)
    Array[File] input_vcf_indexes   # tabix indexes matching input_vcfs
    String sv_pipeline_docker
    Int? preemptible_tries
    Int? disk_size
    Float? mem_gb
  }

  command <<<
  #bcftools gets pissy if the indexes look older than their VCFs
  index_fofn=~{write_lines(input_vcf_indexes)}
  while read index; do touch $index; done < $index_fofn

  # -l: file-of-filenames of VCFs to merge; -m all: join records at the same position;
  # NOTE(review): per bcftools docs, -0 fills missing genotypes as ref -- confirm that
  # is intended for samples without a call at a site
  bcftools merge -l ~{write_lines(input_vcfs)} -o combined.vcf.gz -O z --threads 4 -m all -0

  tabix combined.vcf.gz
  >>>

  output {
    File combined_vcf = "combined.vcf.gz"
    File combined_vcf_index = "combined.vcf.gz.tbi"
  }

  runtime {
    docker: sv_pipeline_docker
    preemptible: select_first([preemptible_tries, 2])
    memory: select_first([mem_gb, 3.5]) + " GiB"
    cpu: "1"
    disks: "local-disk " + select_first([disk_size, 50]) + " HDD"
  }
}