1# Workflow for creating a GATK GermlineCNVCaller denoising model and generating calls given a list of normal samples. Supports both WGS and WES.
2#
3# Notes:
4#
5# - The intervals argument is required for both WGS and WES workflows and accepts formats compatible with the
6#   GATK -L argument (see https://gatkforums.broadinstitute.org/gatk/discussion/11009/intervals-and-interval-lists).
7#   These intervals will be padded on both sides by the amount specified by padding (default 250)
8#   and split into bins of length specified by bin_length (default 1000; specify 0 to skip binning,
9#   e.g., for WES).  For WGS, the intervals should simply cover the chromosomes of interest.
10#
11# - Intervals can be blacklisted from coverage collection and all downstream steps by using the blacklist_intervals
12#   argument, which accepts formats compatible with the GATK -XL argument
13#   (see https://gatkforums.broadinstitute.org/gatk/discussion/11009/intervals-and-interval-lists).
14#   This may be useful for excluding centromeric regions, etc. from analysis.  Alternatively, these regions may
15#   be manually filtered from the final callset.
16#
17# - Example invocation:
18#
19#       java -jar cromwell.jar run cnv_germline_cohort_workflow.wdl -i my_parameters.json
20#
21#############
22
23version 1.0
24
25import "../cnv_common_tasks.wdl" as CNVTasks
26
# Cohort-mode germline CNV workflow.
#
# Stages, in order of data flow:
#   1. PreprocessIntervals           - pad / bin the input intervals
#   2. AnnotateIntervals (optional)  - only run when do_explicit_gc_correction is true (the default)
#   3. CollectCounts                 - one shard per (BAM, index) pair
#   4. FilterIntervals               - filter intervals using annotations and the collected counts
#   5. DetermineGermlineContigPloidyCohortMode
#   6. ScatterIntervals + GermlineCNVCallerCohortMode - one caller shard per scattered interval list
#   7. PostprocessGermlineCNVCalls + CollectSampleQualityMetrics - one shard per sample
#   8. CollectModelQualityMetrics, ScatterPloidyCallsBySample, and path-list writers
workflow CNVGermlineCohortWorkflow {

    input {
      ##################################
      #### required basic arguments ####
      ##################################
      File intervals
      File? blacklist_intervals
      Array[String]+ normal_bams
      Array[String]+ normal_bais
      String cohort_entity_id
      File contig_ploidy_priors
      Int num_intervals_per_scatter
      File ref_fasta_dict
      File ref_fasta_fai
      File ref_fasta
      String gatk_docker

      ##################################
      #### optional basic arguments ####
      ##################################
      # If true (the default when unset), AnnotateIntervals will be run to create GC annotations,
      # and the annotated intervals are passed to FilterIntervals and GermlineCNVCallerCohortMode
      # so that explicit GC correction is performed by the model generated by GermlineCNVCaller.
      Boolean? do_explicit_gc_correction
      File? gatk4_jar_override
      Int? preemptible_attempts

      # Required if BAM/CRAM is in a requester pays bucket
      String? gcs_project_for_requester_pays

      ####################################################
      #### optional arguments for PreprocessIntervals ####
      ####################################################
      Int? padding
      Int? bin_length

      ##################################################
      #### optional arguments for AnnotateIntervals ####
      ##################################################
      File? mappability_track_bed
      File? mappability_track_bed_idx
      File? segmental_duplication_track_bed
      File? segmental_duplication_track_bed_idx
      Int? feature_query_lookahead
      Int? mem_gb_for_annotate_intervals

      ################################################
      #### optional arguments for FilterIntervals ####
      ################################################
      File? blacklist_intervals_for_filter_intervals
      Float? minimum_gc_content
      Float? maximum_gc_content
      Float? minimum_mappability
      Float? maximum_mappability
      Float? minimum_segmental_duplication_content
      Float? maximum_segmental_duplication_content
      Int? low_count_filter_count_threshold
      Float? low_count_filter_percentage_of_samples
      Float? extreme_count_filter_minimum_percentile
      Float? extreme_count_filter_maximum_percentile
      Float? extreme_count_filter_percentage_of_samples
      Int? mem_gb_for_filter_intervals

      ##############################################
      #### optional arguments for CollectCounts ####
      ##############################################
      Array[String]? disabled_read_filters_for_collect_counts
      String? collect_counts_format
      Boolean? collect_counts_enable_indexing
      Int? mem_gb_for_collect_counts

      ########################################################################
      #### optional arguments for DetermineGermlineContigPloidyCohortMode ####
      ########################################################################
      Float? ploidy_mean_bias_standard_deviation
      Float? ploidy_mapping_error_rate
      Float? ploidy_global_psi_scale
      Float? ploidy_sample_psi_scale
      Int? mem_gb_for_determine_germline_contig_ploidy
      Int? cpu_for_determine_germline_contig_ploidy

      ############################################################
      #### optional arguments for GermlineCNVCallerCohortMode ####
      ############################################################
      Float? gcnv_p_alt
      Float? gcnv_p_active
      Float? gcnv_cnv_coherence_length
      Float? gcnv_class_coherence_length
      Int? gcnv_max_copy_number
      Int? mem_gb_for_germline_cnv_caller
      Int? cpu_for_germline_cnv_caller

      # optional arguments for germline CNV denoising model
      Int? gcnv_max_bias_factors
      Float? gcnv_mapping_error_rate
      Float? gcnv_interval_psi_scale
      Float? gcnv_sample_psi_scale
      Float? gcnv_depth_correction_tau
      Float? gcnv_log_mean_bias_standard_deviation
      Float? gcnv_init_ard_rel_unexplained_variance
      Int? gcnv_num_gc_bins
      Float? gcnv_gc_curve_standard_deviation
      String? gcnv_copy_number_posterior_expectation_mode
      Boolean? gcnv_enable_bias_factors
      Int? gcnv_active_class_padding_hybrid_mode

      # optional arguments for Hybrid ADVI
      Float? gcnv_learning_rate
      Float? gcnv_adamax_beta_1
      Float? gcnv_adamax_beta_2
      Int? gcnv_log_emission_samples_per_round
      Float? gcnv_log_emission_sampling_median_rel_error
      Int? gcnv_log_emission_sampling_rounds
      Int? gcnv_max_advi_iter_first_epoch
      Int? gcnv_max_advi_iter_subsequent_epochs
      Int? gcnv_min_training_epochs
      Int? gcnv_max_training_epochs
      Float? gcnv_initial_temperature
      Int? gcnv_num_thermal_advi_iters
      Int? gcnv_convergence_snr_averaging_window
      Float? gcnv_convergence_snr_trigger_threshold
      Int? gcnv_convergence_snr_countdown_window
      Int? gcnv_max_calling_iters
      Float? gcnv_caller_update_convergence_threshold
      Float? gcnv_caller_internal_admixing_rate
      Float? gcnv_caller_external_admixing_rate
      Boolean? gcnv_disable_annealing

      ###################################################
      #### arguments for PostprocessGermlineCNVCalls ####
      ###################################################
      Int ref_copy_number_autosomal_contigs
      # NOTE(review): the two resource inputs below are declared but are not passed to any call in
      # this workflow, so setting them currently has no effect. They are kept so existing input JSONs
      # stay valid; wire them into PostprocessGermlineCNVCalls once that task's parameter names are confirmed.
      Int? mem_gb_for_postprocess_germline_cnv_calls
      Int? disk_space_gb_for_postprocess_germline_cnv_calls
      Array[String]? allosomal_contigs

      ##########################
      #### arguments for QC ####
      ##########################
      Int maximum_number_events_per_sample
      Int maximum_number_pass_events_per_sample
    }

    # Pair each BAM with the index at the same position; the two arrays are expected to be parallel.
    Array[Pair[String, String]] normal_bams_and_bais = zip(normal_bams, normal_bais)

    call CNVTasks.PreprocessIntervals {
        input:
            intervals = intervals,
            blacklist_intervals = blacklist_intervals,
            ref_fasta = ref_fasta,
            ref_fasta_fai = ref_fasta_fai,
            ref_fasta_dict = ref_fasta_dict,
            padding = padding,
            bin_length = bin_length,
            gatk4_jar_override = gatk4_jar_override,
            gatk_docker = gatk_docker,
            preemptible_attempts = preemptible_attempts
    }

    # Annotation is skipped only when do_explicit_gc_correction is explicitly false; in that case
    # AnnotateIntervals.annotated_intervals is an undefined optional for the downstream calls.
    if (select_first([do_explicit_gc_correction, true])) {
        call CNVTasks.AnnotateIntervals {
            input:
                intervals = PreprocessIntervals.preprocessed_intervals,
                ref_fasta = ref_fasta,
                ref_fasta_fai = ref_fasta_fai,
                ref_fasta_dict = ref_fasta_dict,
                mappability_track_bed = mappability_track_bed,
                mappability_track_bed_idx = mappability_track_bed_idx,
                segmental_duplication_track_bed = segmental_duplication_track_bed,
                segmental_duplication_track_bed_idx = segmental_duplication_track_bed_idx,
                feature_query_lookahead = feature_query_lookahead,
                gatk4_jar_override = gatk4_jar_override,
                gatk_docker = gatk_docker,
                mem_gb = mem_gb_for_annotate_intervals,
                preemptible_attempts = preemptible_attempts
        }
    }

    # One CollectCounts shard per sample.
    scatter (normal_bam_and_bai in normal_bams_and_bais) {
        call CNVTasks.CollectCounts {
            input:
                intervals = PreprocessIntervals.preprocessed_intervals,
                bam = normal_bam_and_bai.left,
                bam_idx = normal_bam_and_bai.right,
                ref_fasta = ref_fasta,
                ref_fasta_fai = ref_fasta_fai,
                ref_fasta_dict = ref_fasta_dict,
                format = collect_counts_format,
                enable_indexing = collect_counts_enable_indexing,
                disabled_read_filters = disabled_read_filters_for_collect_counts,
                gatk4_jar_override = gatk4_jar_override,
                gatk_docker = gatk_docker,
                mem_gb = mem_gb_for_collect_counts,
                preemptible_attempts = preemptible_attempts,
                gcs_project_for_requester_pays = gcs_project_for_requester_pays
        }
    }

    call CNVTasks.FilterIntervals {
        input:
            intervals = PreprocessIntervals.preprocessed_intervals,
            blacklist_intervals = blacklist_intervals_for_filter_intervals,
            annotated_intervals = AnnotateIntervals.annotated_intervals,
            read_count_files = CollectCounts.counts,
            minimum_gc_content = minimum_gc_content,
            maximum_gc_content = maximum_gc_content,
            minimum_mappability = minimum_mappability,
            maximum_mappability = maximum_mappability,
            minimum_segmental_duplication_content = minimum_segmental_duplication_content,
            maximum_segmental_duplication_content = maximum_segmental_duplication_content,
            low_count_filter_count_threshold = low_count_filter_count_threshold,
            low_count_filter_percentage_of_samples = low_count_filter_percentage_of_samples,
            extreme_count_filter_minimum_percentile = extreme_count_filter_minimum_percentile,
            extreme_count_filter_maximum_percentile = extreme_count_filter_maximum_percentile,
            extreme_count_filter_percentage_of_samples = extreme_count_filter_percentage_of_samples,
            gatk4_jar_override = gatk4_jar_override,
            gatk_docker = gatk_docker,
            mem_gb = mem_gb_for_filter_intervals,
            preemptible_attempts = preemptible_attempts
    }

    call DetermineGermlineContigPloidyCohortMode {
        input:
            cohort_entity_id = cohort_entity_id,
            intervals = FilterIntervals.filtered_intervals,
            read_count_files = CollectCounts.counts,
            contig_ploidy_priors = contig_ploidy_priors,
            gatk4_jar_override = gatk4_jar_override,
            gatk_docker = gatk_docker,
            mem_gb = mem_gb_for_determine_germline_contig_ploidy,
            cpu = cpu_for_determine_germline_contig_ploidy,
            mean_bias_standard_deviation = ploidy_mean_bias_standard_deviation,
            mapping_error_rate = ploidy_mapping_error_rate,
            global_psi_scale = ploidy_global_psi_scale,
            sample_psi_scale = ploidy_sample_psi_scale,
            preemptible_attempts = preemptible_attempts
    }

    call CNVTasks.ScatterIntervals {
        input:
            interval_list = FilterIntervals.filtered_intervals,
            num_intervals_per_scatter = num_intervals_per_scatter,
            gatk_docker = gatk_docker,
            preemptible_attempts = preemptible_attempts
    }

    # One GermlineCNVCaller shard per scattered interval list; every shard sees all samples.
    scatter (scatter_index in range(length(ScatterIntervals.scattered_interval_lists))) {
        call GermlineCNVCallerCohortMode {
            input:
                scatter_index = scatter_index,
                cohort_entity_id = cohort_entity_id,
                read_count_files = CollectCounts.counts,
                contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar,
                intervals = ScatterIntervals.scattered_interval_lists[scatter_index],
                annotated_intervals = AnnotateIntervals.annotated_intervals,
                gatk4_jar_override = gatk4_jar_override,
                gatk_docker = gatk_docker,
                mem_gb = mem_gb_for_germline_cnv_caller,
                cpu = cpu_for_germline_cnv_caller,
                p_alt = gcnv_p_alt,
                p_active = gcnv_p_active,
                cnv_coherence_length = gcnv_cnv_coherence_length,
                class_coherence_length = gcnv_class_coherence_length,
                max_copy_number = gcnv_max_copy_number,
                max_bias_factors = gcnv_max_bias_factors,
                mapping_error_rate = gcnv_mapping_error_rate,
                interval_psi_scale = gcnv_interval_psi_scale,
                sample_psi_scale = gcnv_sample_psi_scale,
                depth_correction_tau = gcnv_depth_correction_tau,
                log_mean_bias_standard_deviation = gcnv_log_mean_bias_standard_deviation,
                init_ard_rel_unexplained_variance = gcnv_init_ard_rel_unexplained_variance,
                num_gc_bins = gcnv_num_gc_bins,
                gc_curve_standard_deviation = gcnv_gc_curve_standard_deviation,
                copy_number_posterior_expectation_mode = gcnv_copy_number_posterior_expectation_mode,
                enable_bias_factors = gcnv_enable_bias_factors,
                active_class_padding_hybrid_mode = gcnv_active_class_padding_hybrid_mode,
                learning_rate = gcnv_learning_rate,
                adamax_beta_1 = gcnv_adamax_beta_1,
                adamax_beta_2 = gcnv_adamax_beta_2,
                log_emission_samples_per_round = gcnv_log_emission_samples_per_round,
                log_emission_sampling_median_rel_error = gcnv_log_emission_sampling_median_rel_error,
                log_emission_sampling_rounds = gcnv_log_emission_sampling_rounds,
                max_advi_iter_first_epoch = gcnv_max_advi_iter_first_epoch,
                max_advi_iter_subsequent_epochs = gcnv_max_advi_iter_subsequent_epochs,
                min_training_epochs = gcnv_min_training_epochs,
                max_training_epochs = gcnv_max_training_epochs,
                initial_temperature = gcnv_initial_temperature,
                num_thermal_advi_iters = gcnv_num_thermal_advi_iters,
                convergence_snr_averaging_window = gcnv_convergence_snr_averaging_window,
                convergence_snr_trigger_threshold = gcnv_convergence_snr_trigger_threshold,
                convergence_snr_countdown_window = gcnv_convergence_snr_countdown_window,
                max_calling_iters = gcnv_max_calling_iters,
                caller_update_convergence_threshold = gcnv_caller_update_convergence_threshold,
                caller_internal_admixing_rate = gcnv_caller_internal_admixing_rate,
                caller_external_admixing_rate = gcnv_caller_external_admixing_rate,
                disable_annealing = gcnv_disable_annealing,
                preemptible_attempts = preemptible_attempts
        }
    }

    # The gathered caller output is indexed [shard][sample]; transpose it to [sample][shard]
    # so that each postprocessing shard can pick up all call tars belonging to one sample.
    Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCohortMode.gcnv_call_tars)

    # One postprocessing + QC shard per sample.
    scatter (sample_index in range(length(CollectCounts.entity_id))) {
        call CNVTasks.PostprocessGermlineCNVCalls {
            input:
                entity_id = CollectCounts.entity_id[sample_index],
                gcnv_calls_tars = call_tars_sample_by_shard[sample_index],
                gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar,
                calling_configs = GermlineCNVCallerCohortMode.calling_config_json,
                denoising_configs = GermlineCNVCallerCohortMode.denoising_config_json,
                gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json,
                sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list,
                contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar,
                allosomal_contigs = allosomal_contigs,
                ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs,
                sample_index = sample_index,
                gatk4_jar_override = gatk4_jar_override,
                gatk_docker = gatk_docker,
                preemptible_attempts = preemptible_attempts
        }

        call CNVTasks.CollectSampleQualityMetrics {
            input:
                genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf,
                entity_id = CollectCounts.entity_id[sample_index],
                maximum_number_events = maximum_number_events_per_sample,
                maximum_number_pass_events = maximum_number_pass_events_per_sample,
                bash_docker = gatk_docker,
                preemptible_attempts = preemptible_attempts
        }
    }

    call CNVTasks.CollectModelQualityMetrics {
        input:
            gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar,
            gatk_docker = gatk_docker,
            preemptible_attempts = preemptible_attempts
    }

    call CNVTasks.ScatterPloidyCallsBySample {
        input:
            contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar,
            samples = CollectCounts.entity_id,
            docker = gatk_docker,
            preemptible_attempts = preemptible_attempts
    }

    # The remaining calls only write the paths of already-produced outputs into list files.
    call WritePathList as WritePloidyCalls {
        input:
            file_paths = [DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar],
            outfile = "contig_ploidy_calls_tar.paths.list"
    }

    call WritePathMatrix as WriteGCNVCalls {
        input:
            path_matrix = GermlineCNVCallerCohortMode.gcnv_call_tars,
            outfile = "gcnv_call_tars.paths.list"
    }

    call WritePathList as WriteSegments {
        input:
            file_paths = PostprocessGermlineCNVCalls.genotyped_segments_vcf,
            outfile = "genotyped_segments_vcf.paths.list"
    }

    call WritePathList as WriteSegmentIndexes {
        input:
            file_paths = PostprocessGermlineCNVCalls.genotyped_segments_vcf_index,
            outfile = "genotyped_segments_vcf_index.paths.list"
    }

    call WritePathList as WriteIntervals {
        input:
            file_paths = PostprocessGermlineCNVCalls.genotyped_intervals_vcf,
            outfile = "genotyped_intervals_vcf.paths.list"
    }

    call WritePathList as WriteIntervalIndexes {
        input:
            file_paths = PostprocessGermlineCNVCalls.genotyped_intervals_vcf_index,
            outfile = "genotyped_intervals_vcf_index.paths.list"
    }


    # Each output name is declared exactly once (duplicate declarations are a name collision).
    output {
        File preprocessed_intervals = PreprocessIntervals.preprocessed_intervals
        Array[File] read_counts_entity_ids = CollectCounts.entity_id
        Array[File] read_counts = CollectCounts.counts
        File? annotated_intervals = AnnotateIntervals.annotated_intervals
        File filtered_intervals = FilterIntervals.filtered_intervals
        File contig_ploidy_model_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_model_tar
        File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar
        File contig_ploidy_calls_tar_path_list = WritePloidyCalls.path_list
        Array[File] sample_contig_ploidy_calls_tars = ScatterPloidyCallsBySample.sample_contig_ploidy_calls_tar
        Array[File] gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar
        Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_call_tars
        File gcnv_calls_tars_path_list = WriteGCNVCalls.path_list
        Array[File] gcnv_tracking_tars = GermlineCNVCallerCohortMode.gcnv_tracking_tar

        Array[File] genotyped_intervals_vcfs = PostprocessGermlineCNVCalls.genotyped_intervals_vcf
        File genotyped_intervals_vcfs_path_list = WriteIntervals.path_list
        Array[File] genotyped_intervals_vcf_indexes = PostprocessGermlineCNVCalls.genotyped_intervals_vcf_index
        File genotyped_intervals_vcf_indexes_path_list = WriteIntervalIndexes.path_list
        Array[File] genotyped_segments_vcfs = PostprocessGermlineCNVCalls.genotyped_segments_vcf
        File genotyped_segments_vcfs_path_list = WriteSegments.path_list
        Array[File] genotyped_segments_vcf_indexes = PostprocessGermlineCNVCalls.genotyped_segments_vcf_index
        File genotyped_segments_vcf_indexes_path_list = WriteSegmentIndexes.path_list

        Array[File] denoised_copy_ratios = PostprocessGermlineCNVCalls.denoised_copy_ratios
        Array[File] sample_qc_status_files = CollectSampleQualityMetrics.qc_status_file
        Array[String] sample_qc_status_strings = CollectSampleQualityMetrics.qc_status_string
        File model_qc_status_file = CollectModelQualityMetrics.qc_status_file
        String model_qc_string = CollectModelQualityMetrics.qc_status_string

        Array[File] calling_configs = GermlineCNVCallerCohortMode.calling_config_json
        Array[File] denoising_configs = GermlineCNVCallerCohortMode.denoising_config_json
        Array[File] gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json
        Array[File] sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list
    }
}
449
# Runs GATK DetermineGermlineContigPloidy on the cohort's read-count files and packages the
# resulting "<cohort_entity_id>-model" and "<cohort_entity_id>-calls" directories as tarballs.
task DetermineGermlineContigPloidyCohortMode {
    input {
      # Cohort data
      String cohort_entity_id
      File? intervals
      Array[File] read_count_files
      File contig_ploidy_priors
      String? output_dir
      File? gatk4_jar_override

      # Runtime parameters
      String gatk_docker
      Int? mem_gb
      Int? disk_space_gb
      Boolean use_ssd = false
      Int? cpu
      Int? preemptible_attempts

      # Model parameters
      # (Hybrid ADVI parameters are intentionally not exposed -- the default values are decent)
      Float? mean_bias_standard_deviation
      Float? mapping_error_rate
      Float? global_psi_scale
      Float? sample_psi_scale
    }

    # VM memory defaults to 7 (GB input, scaled by 1000); the JVM heap is kept 500 MB below it.
    Int vm_mem_mb = select_first([mem_gb, 7]) * 1000
    Int jvm_heap_mb = vm_mem_mb - 500

    # The same core count drives both the thread-count exports and the runtime cpu request.
    Int num_threads = select_first([cpu, 8])

    # Disk request pieces.
    Int vm_disk_gb = select_first([disk_space_gb, 150])
    String vm_disk_type = if use_ssd then "SSD" else "HDD"

    # Working directory for the tool; falls back to "out" when output_dir is not given.
    String work_dir = select_first([output_dir, "out"])

    # Names of the tarballs produced by the command and picked up as outputs.
    String model_tar_name = "~{cohort_entity_id}-contig-ploidy-model.tar.gz"
    String calls_tar_name = "~{cohort_entity_id}-contig-ploidy-calls.tar.gz"

    command <<<
        set -eu
        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override}
        export MKL_NUM_THREADS=~{num_threads}
        export OMP_NUM_THREADS=~{num_threads}

        gatk --java-options "-Xmx~{jvm_heap_mb}m"  DetermineGermlineContigPloidy \
            ~{"-L " + intervals} \
            --input ~{sep=" --input " read_count_files} \
            --contig-ploidy-priors ~{contig_ploidy_priors} \
            --interval-merging-rule OVERLAPPING_ONLY \
            --output ~{work_dir} \
            --output-prefix ~{cohort_entity_id} \
            --verbosity DEBUG \
            --mean-bias-standard-deviation ~{default="0.01" mean_bias_standard_deviation} \
            --mapping-error-rate ~{default="0.01" mapping_error_rate} \
            --global-psi-scale ~{default="0.001" global_psi_scale} \
            --sample-psi-scale ~{default="0.0001" sample_psi_scale}

        tar czf ~{model_tar_name} -C ~{work_dir}/~{cohort_entity_id}-model .
        tar czf ~{calls_tar_name} -C ~{work_dir}/~{cohort_entity_id}-calls .
    >>>

    runtime {
        docker: gatk_docker
        memory: "~{vm_mem_mb} MB"
        disks: "local-disk ~{vm_disk_gb} ~{vm_disk_type}"
        cpu: num_threads
        preemptible: select_first([preemptible_attempts, 2])
    }

    output {
        File contig_ploidy_model_tar = model_tar_name
        File contig_ploidy_calls_tar = calls_tar_name
    }
}
518
# Runs GATK GermlineCNVCaller in COHORT mode on one interval shard for all samples.
#
# Outputs per shard: a model tarball, a tracking tarball, one calls tarball per sample,
# and the config / version / interval-list files found in the "<cohort_entity_id>-calls" directory.
task GermlineCNVCallerCohortMode {
    input {
      Int scatter_index
      String cohort_entity_id
      Array[File] read_count_files
      File contig_ploidy_calls_tar
      File intervals
      File? annotated_intervals
      String? output_dir
      File? gatk4_jar_override

      # Runtime parameters
      String gatk_docker
      Int? mem_gb
      Int? disk_space_gb
      Boolean use_ssd = false
      Int? cpu
      Int? preemptible_attempts

      # Caller parameters
      Float? p_alt
      Float? p_active
      Float? cnv_coherence_length
      Float? class_coherence_length
      Int? max_copy_number

      # Denoising model parameters
      Int? max_bias_factors
      Float? mapping_error_rate
      Float? interval_psi_scale
      Float? sample_psi_scale
      Float? depth_correction_tau
      Float? log_mean_bias_standard_deviation
      Float? init_ard_rel_unexplained_variance
      Int? num_gc_bins
      Float? gc_curve_standard_deviation
      String? copy_number_posterior_expectation_mode
      Boolean? enable_bias_factors
      Int? active_class_padding_hybrid_mode

      # Hybrid ADVI parameters
      Float? learning_rate
      Float? adamax_beta_1
      Float? adamax_beta_2
      Int? log_emission_samples_per_round
      Float? log_emission_sampling_median_rel_error
      Int? log_emission_sampling_rounds
      Int? max_advi_iter_first_epoch
      Int? max_advi_iter_subsequent_epochs
      Int? min_training_epochs
      Int? max_training_epochs
      Float? initial_temperature
      Int? num_thermal_advi_iters
      Int? convergence_snr_averaging_window
      Float? convergence_snr_trigger_threshold
      Int? convergence_snr_countdown_window
      Int? max_calling_iters
      Float? caller_update_convergence_threshold
      Float? caller_internal_admixing_rate
      Float? caller_external_admixing_rate
      Boolean? disable_annealing
    }

    # VM memory defaults to 7 (GB input, scaled by 1000); the JVM heap is kept 500 MB below it.
    Int machine_mem_mb = select_first([mem_gb, 7]) * 1000
    Int command_mem_mb = machine_mem_mb - 500

    # If optional output_dir not specified, use "out"
    String output_dir_ = select_first([output_dir, "out"])

    # Number of samples, used by the shell loop below to tar each SAMPLE_<i> directory.
    Int num_samples = length(read_count_files)

    # Only ~{...} is interpolated by WDL inside this heredoc-style command, so bash
    # expansions such as ${#NUM_SAMPLES} are written directly with no "$" workaround.
    command <<<
        set -eu
        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override}
        export MKL_NUM_THREADS=~{default=8 cpu}
        export OMP_NUM_THREADS=~{default=8 cpu}

        mkdir contig-ploidy-calls
        tar xzf ~{contig_ploidy_calls_tar} -C contig-ploidy-calls

        gatk --java-options "-Xmx~{command_mem_mb}m"  GermlineCNVCaller \
            --run-mode COHORT \
            -L ~{intervals} \
            --input ~{sep=" --input " read_count_files} \
            --contig-ploidy-calls contig-ploidy-calls \
            ~{"--annotated-intervals " + annotated_intervals} \
            --interval-merging-rule OVERLAPPING_ONLY \
            --output ~{output_dir_} \
            --output-prefix ~{cohort_entity_id} \
            --verbosity DEBUG \
            --p-alt ~{default="1e-6" p_alt} \
            --p-active ~{default="1e-2" p_active} \
            --cnv-coherence-length ~{default="10000.0" cnv_coherence_length} \
            --class-coherence-length ~{default="10000.0" class_coherence_length} \
            --max-copy-number ~{default="5" max_copy_number} \
            --max-bias-factors ~{default="5" max_bias_factors} \
            --mapping-error-rate ~{default="0.01" mapping_error_rate} \
            --interval-psi-scale ~{default="0.001" interval_psi_scale} \
            --sample-psi-scale ~{default="0.0001" sample_psi_scale} \
            --depth-correction-tau ~{default="10000.0" depth_correction_tau} \
            --log-mean-bias-standard-deviation ~{default="0.1" log_mean_bias_standard_deviation} \
            --init-ard-rel-unexplained-variance ~{default="0.1" init_ard_rel_unexplained_variance} \
            --num-gc-bins ~{default="20" num_gc_bins} \
            --gc-curve-standard-deviation ~{default="1.0" gc_curve_standard_deviation} \
            --copy-number-posterior-expectation-mode ~{default="HYBRID" copy_number_posterior_expectation_mode} \
            --enable-bias-factors ~{default="true" enable_bias_factors} \
            --active-class-padding-hybrid-mode ~{default="50000" active_class_padding_hybrid_mode} \
            --learning-rate ~{default="0.05" learning_rate} \
            --adamax-beta-1 ~{default="0.9" adamax_beta_1} \
            --adamax-beta-2 ~{default="0.99" adamax_beta_2} \
            --log-emission-samples-per-round ~{default="50" log_emission_samples_per_round} \
            --log-emission-sampling-median-rel-error ~{default="0.005" log_emission_sampling_median_rel_error} \
            --log-emission-sampling-rounds ~{default="10" log_emission_sampling_rounds} \
            --max-advi-iter-first-epoch ~{default="5000" max_advi_iter_first_epoch} \
            --max-advi-iter-subsequent-epochs ~{default="100" max_advi_iter_subsequent_epochs} \
            --min-training-epochs ~{default="10" min_training_epochs} \
            --max-training-epochs ~{default="100" max_training_epochs} \
            --initial-temperature ~{default="2.0" initial_temperature} \
            --num-thermal-advi-iters ~{default="2500" num_thermal_advi_iters} \
            --convergence-snr-averaging-window ~{default="500" convergence_snr_averaging_window} \
            --convergence-snr-trigger-threshold ~{default="0.1" convergence_snr_trigger_threshold} \
            --convergence-snr-countdown-window ~{default="10" convergence_snr_countdown_window} \
            --max-calling-iters ~{default="10" max_calling_iters} \
            --caller-update-convergence-threshold ~{default="0.001" caller_update_convergence_threshold} \
            --caller-internal-admixing-rate ~{default="0.75" caller_internal_admixing_rate} \
            --caller-external-admixing-rate ~{default="1.00" caller_external_admixing_rate} \
            --disable-annealing ~{default="false" disable_annealing}

        tar czf ~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-model .
        tar czf ~{cohort_entity_id}-gcnv-tracking-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-tracking .

        # Tar each SAMPLE_<i> calls directory separately. The sample index in the tarball name is
        # zero-padded to the number of digits in NUM_SAMPLES so that all names have equal width and
        # a lexicographic listing of the tarballs follows the numeric sample order.
        CURRENT_SAMPLE=0
        NUM_SAMPLES=~{num_samples}
        NUM_DIGITS=${#NUM_SAMPLES}
        while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do
            CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${NUM_DIGITS}d" $CURRENT_SAMPLE)
            tar czf ~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-calls/SAMPLE_$CURRENT_SAMPLE .
            let CURRENT_SAMPLE=CURRENT_SAMPLE+1
        done

        # The extracted ploidy calls are only needed while the caller runs.
        rm -rf contig-ploidy-calls
    >>>

    runtime {
        docker: gatk_docker
        memory: machine_mem_mb + " MB"
        disks: "local-disk " + select_first([disk_space_gb, 150]) + if use_ssd then " SSD" else " HDD"
        cpu: select_first([cpu, 8])
        preemptible: select_first([preemptible_attempts, 2])
    }

    output {
        File gcnv_model_tar = "~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz"
        # One tarball per sample, collected via glob on the zero-padded names written above.
        Array[File] gcnv_call_tars = glob("~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}-sample-*.tar.gz")
        File gcnv_tracking_tar = "~{cohort_entity_id}-gcnv-tracking-shard-~{scatter_index}.tar.gz"
        File calling_config_json = "~{output_dir_}/~{cohort_entity_id}-calls/calling_config.json"
        File denoising_config_json = "~{output_dir_}/~{cohort_entity_id}-calls/denoising_config.json"
        File gcnvkernel_version_json = "~{output_dir_}/~{cohort_entity_id}-calls/gcnvkernel_version.json"
        File sharded_interval_list = "~{output_dir_}/~{cohort_entity_id}-calls/interval_list.tsv"
    }
}
681
# Writes the given path strings, one per line, into a text file named `outfile`.
#
# Inputs:
#   file_paths            - path strings to list (taken as strings, so the files are not localized)
#   outfile               - name of the list file to produce
#   docker / machine_mem_gb / disk_space_gb / preemptible_attempts - runtime settings
# Output:
#   path_list             - the produced list file
task WritePathList {
    input {
        Array[String] file_paths
        String outfile

        # Runtime parameters
        String docker = "python:latest"
        Int machine_mem_gb = 7
        Int disk_space_gb = 100
        Int preemptible_attempts = 3
    }

    # NOTE(review): the paths are spliced into a single-quoted Python list literal, so a path
    # containing a single quote or a backslash would not round-trip; typical storage paths are fine.
    command <<<
    set -oe pipefail

    python << CODE
    file_paths = ['~{sep="','" file_paths}']

    with open("path_list.txt", "w") as fi:
      for path in file_paths:
        fi.write(path + "\n")

    CODE
    mv path_list.txt ~{outfile}
    >>>

    runtime {
      docker: docker
      memory: machine_mem_gb + " GB"
      disks: "local-disk " + disk_space_gb + " HDD"
      # Honor the preemptible_attempts input (default 3) instead of a hard-coded value.
      preemptible: preemptible_attempts
    }

    output {
        File path_list = outfile
    }
}
719
# Writes a matrix of path strings as a tab-separated file (one inner array per row) named `outfile`.
#
# Inputs:
#   path_matrix           - rows of path strings (taken as strings, so the files are not localized)
#   outfile               - name of the TSV file to produce
#   docker / machine_mem_gb / disk_space_gb / preemptible_attempts - runtime settings; these are
#                           optional inputs whose defaults match the previously fixed values
# Output:
#   path_list             - the produced TSV file
task WritePathMatrix {
    input {
        Array[Array[String]] path_matrix
        String outfile

        # Runtime parameters
        String docker = "python:latest"
        Int machine_mem_gb = 7
        Int disk_space_gb = 100
        Int preemptible_attempts = 3
    }

    # write_tsv() serializes the matrix to a temporary file; it is simply renamed to `outfile`.
    command <<<
        mv ~{write_tsv(path_matrix)} ~{outfile}
    >>>

    runtime {
      docker: docker
      memory: machine_mem_gb + " GB"
      disks: "local-disk " + disk_space_gb + " HDD"
      # Honor preemptible_attempts (default 3) instead of a hard-coded value.
      preemptible: preemptible_attempts
    }

    output {
        File path_list = outfile
    }
}
747