# Workflow for creating a GATK GermlineCNVCaller denoising model and generating calls given a list of normal samples. Supports both WGS and WES.
#
# Notes:
#
# - The intervals argument is required for both WGS and WES workflows and accepts formats compatible with the
#   GATK -L argument (see https://gatkforums.broadinstitute.org/gatk/discussion/11009/intervals-and-interval-lists).
#   These intervals will be padded on both sides by the amount specified by padding (default 250)
#   and split into bins of length specified by bin_length (default 1000; specify 0 to skip binning,
#   e.g., for WES).  For WGS, the intervals should simply cover the chromosomes of interest.
#
# - Intervals can be blacklisted from coverage collection and all downstream steps by using the blacklist_intervals
#   argument, which accepts formats compatible with the GATK -XL argument
#   (see https://gatkforums.broadinstitute.org/gatk/discussion/11009/intervals-and-interval-lists).
#   This may be useful for excluding centromeric regions, etc. from analysis.  Alternatively, these regions may
#   be manually filtered from the final callset.
#
# - Example invocation:
#
#       java -jar cromwell.jar run cnv_germline_cohort_workflow.wdl -i my_parameters.json
#
#############

version 1.0

import "../cnv_common_tasks.wdl" as CNVTasks

# Cohort-mode germline CNV workflow.
#
# Stages, in the order they are wired below:
#   1. PreprocessIntervals               - prepare the interval list.
#   2. AnnotateIntervals (optional)      - runs unless do_explicit_gc_correction is explicitly false.
#   3. CollectCounts                     - scattered over (bam, bai) pairs.
#   4. FilterIntervals                   - uses the counts and, if present, the annotations.
#   5. DetermineGermlineContigPloidy     - cohort mode, on the filtered intervals.
#   6. ScatterIntervals + GermlineCNVCaller (cohort mode) - one caller shard per interval shard.
#   7. PostprocessGermlineCNVCalls + CollectSampleQualityMetrics - scattered over samples.
#   8. CollectModelQualityMetrics, ScatterPloidyCallsBySample, and path-list bookkeeping tasks.
workflow CNVGermlineCohortWorkflow {

    input {
        ##################################
        #### required basic arguments ####
        ##################################
        File intervals
        File? blacklist_intervals
        Array[String]+ normal_bams
        Array[String]+ normal_bais
        String cohort_entity_id
        File contig_ploidy_priors
        Int num_intervals_per_scatter
        File ref_fasta_dict
        File ref_fasta_fai
        File ref_fasta
        String gatk_docker

        ##################################
        #### optional basic arguments ####
        ##################################
        # If true (the default when unset), AnnotateIntervals will be run to create GC annotations and
        # explicit GC correction will be performed by the model generated by GermlineCNVCallerCohortMode,
        # which receives the annotated intervals.
        Boolean? do_explicit_gc_correction
        File? gatk4_jar_override
        Int? preemptible_attempts

        # Required if BAM/CRAM is in a requester pays bucket
        String? gcs_project_for_requester_pays

        ####################################################
        #### optional arguments for PreprocessIntervals ####
        ####################################################
        Int? padding
        Int? bin_length

        ##################################################
        #### optional arguments for AnnotateIntervals ####
        ##################################################
        File? mappability_track_bed
        File? mappability_track_bed_idx
        File? segmental_duplication_track_bed
        File? segmental_duplication_track_bed_idx
        Int? feature_query_lookahead
        Int? mem_gb_for_annotate_intervals

        ################################################
        #### optional arguments for FilterIntervals ####
        ################################################
        File? blacklist_intervals_for_filter_intervals
        Float? minimum_gc_content
        Float? maximum_gc_content
        Float? minimum_mappability
        Float? maximum_mappability
        Float? minimum_segmental_duplication_content
        Float? maximum_segmental_duplication_content
        Int? low_count_filter_count_threshold
        Float? low_count_filter_percentage_of_samples
        Float? extreme_count_filter_minimum_percentile
        Float? extreme_count_filter_maximum_percentile
        Float? extreme_count_filter_percentage_of_samples
        Int? mem_gb_for_filter_intervals

        ##############################################
        #### optional arguments for CollectCounts ####
        ##############################################
        Array[String]? disabled_read_filters_for_collect_counts
        String? collect_counts_format
        Boolean? collect_counts_enable_indexing
        Int? mem_gb_for_collect_counts

        ########################################################################
        #### optional arguments for DetermineGermlineContigPloidyCohortMode ####
        ########################################################################
        Float? ploidy_mean_bias_standard_deviation
        Float? ploidy_mapping_error_rate
        Float? ploidy_global_psi_scale
        Float? ploidy_sample_psi_scale
        Int? mem_gb_for_determine_germline_contig_ploidy
        Int? cpu_for_determine_germline_contig_ploidy

        ############################################################
        #### optional arguments for GermlineCNVCallerCohortMode ####
        ############################################################
        Float? gcnv_p_alt
        Float? gcnv_p_active
        Float? gcnv_cnv_coherence_length
        Float? gcnv_class_coherence_length
        Int? gcnv_max_copy_number
        Int? mem_gb_for_germline_cnv_caller
        Int? cpu_for_germline_cnv_caller

        # optional arguments for germline CNV denoising model
        Int? gcnv_max_bias_factors
        Float? gcnv_mapping_error_rate
        Float? gcnv_interval_psi_scale
        Float? gcnv_sample_psi_scale
        Float? gcnv_depth_correction_tau
        Float? gcnv_log_mean_bias_standard_deviation
        Float? gcnv_init_ard_rel_unexplained_variance
        Int? gcnv_num_gc_bins
        Float? gcnv_gc_curve_standard_deviation
        String? gcnv_copy_number_posterior_expectation_mode
        Boolean? gcnv_enable_bias_factors
        Int? gcnv_active_class_padding_hybrid_mode

        # optional arguments for Hybrid ADVI
        Float? gcnv_learning_rate
        Float? gcnv_adamax_beta_1
        Float? gcnv_adamax_beta_2
        Int? gcnv_log_emission_samples_per_round
        Float? gcnv_log_emission_sampling_median_rel_error
        Int? gcnv_log_emission_sampling_rounds
        Int? gcnv_max_advi_iter_first_epoch
        Int? gcnv_max_advi_iter_subsequent_epochs
        Int? gcnv_min_training_epochs
        Int? gcnv_max_training_epochs
        Float? gcnv_initial_temperature
        Int? gcnv_num_thermal_advi_iters
        Int? gcnv_convergence_snr_averaging_window
        Float? gcnv_convergence_snr_trigger_threshold
        Int? gcnv_convergence_snr_countdown_window
        Int? gcnv_max_calling_iters
        Float? gcnv_caller_update_convergence_threshold
        Float? gcnv_caller_internal_admixing_rate
        Float? gcnv_caller_external_admixing_rate
        Boolean? gcnv_disable_annealing

        ###################################################
        #### arguments for PostprocessGermlineCNVCalls ####
        ###################################################
        Int ref_copy_number_autosomal_contigs
        # NOTE(review): the next two inputs are declared but never passed to any call in this
        # workflow. Wire them into PostprocessGermlineCNVCalls once the task's parameter names
        # have been confirmed in cnv_common_tasks.wdl.
        Int? mem_gb_for_postprocess_germline_cnv_calls
        Int? disk_space_gb_for_postprocess_germline_cnv_calls
        Array[String]? allosomal_contigs

        ##########################
        #### arguments for QC ####
        ##########################
        Int maximum_number_events_per_sample
        Int maximum_number_pass_events_per_sample
    }

    # Pair each BAM with the index at the same position so the scatter below sees them together.
    Array[Pair[String, String]] normal_bams_and_bais = zip(normal_bams, normal_bais)

    call CNVTasks.PreprocessIntervals {
        input:
            intervals = intervals,
            blacklist_intervals = blacklist_intervals,
            ref_fasta = ref_fasta,
            ref_fasta_fai = ref_fasta_fai,
            ref_fasta_dict = ref_fasta_dict,
            padding = padding,
            bin_length = bin_length,
            gatk4_jar_override = gatk4_jar_override,
            gatk_docker = gatk_docker,
            preemptible_attempts = preemptible_attempts
    }

    # Annotation is on by default; it is skipped only when do_explicit_gc_correction is set to false.
    if (select_first([do_explicit_gc_correction, true])) {
        call CNVTasks.AnnotateIntervals {
            input:
                intervals = PreprocessIntervals.preprocessed_intervals,
                ref_fasta = ref_fasta,
                ref_fasta_fai = ref_fasta_fai,
                ref_fasta_dict = ref_fasta_dict,
                mappability_track_bed = mappability_track_bed,
                mappability_track_bed_idx = mappability_track_bed_idx,
                segmental_duplication_track_bed = segmental_duplication_track_bed,
                segmental_duplication_track_bed_idx = segmental_duplication_track_bed_idx,
                feature_query_lookahead = feature_query_lookahead,
                gatk4_jar_override = gatk4_jar_override,
                gatk_docker = gatk_docker,
                mem_gb = mem_gb_for_annotate_intervals,
                preemptible_attempts = preemptible_attempts
        }
    }

    # One CollectCounts call per (bam, bai) pair.
    scatter (normal_bam_and_bai in normal_bams_and_bais) {
        call CNVTasks.CollectCounts {
            input:
                intervals = PreprocessIntervals.preprocessed_intervals,
                bam = normal_bam_and_bai.left,
                bam_idx = normal_bam_and_bai.right,
                ref_fasta = ref_fasta,
                ref_fasta_fai = ref_fasta_fai,
                ref_fasta_dict = ref_fasta_dict,
                format = collect_counts_format,
                enable_indexing = collect_counts_enable_indexing,
                disabled_read_filters = disabled_read_filters_for_collect_counts,
                gatk4_jar_override = gatk4_jar_override,
                gatk_docker = gatk_docker,
                mem_gb = mem_gb_for_collect_counts,
                preemptible_attempts = preemptible_attempts,
                gcs_project_for_requester_pays = gcs_project_for_requester_pays
        }
    }

    # AnnotateIntervals.annotated_intervals is optional here because the call sits inside the
    # conditional above.
    call CNVTasks.FilterIntervals {
        input:
            intervals = PreprocessIntervals.preprocessed_intervals,
            blacklist_intervals = blacklist_intervals_for_filter_intervals,
            annotated_intervals = AnnotateIntervals.annotated_intervals,
            read_count_files = CollectCounts.counts,
            minimum_gc_content = minimum_gc_content,
            maximum_gc_content = maximum_gc_content,
            minimum_mappability = minimum_mappability,
            maximum_mappability = maximum_mappability,
            minimum_segmental_duplication_content = minimum_segmental_duplication_content,
            maximum_segmental_duplication_content = maximum_segmental_duplication_content,
            low_count_filter_count_threshold = low_count_filter_count_threshold,
            low_count_filter_percentage_of_samples = low_count_filter_percentage_of_samples,
            extreme_count_filter_minimum_percentile = extreme_count_filter_minimum_percentile,
            extreme_count_filter_maximum_percentile = extreme_count_filter_maximum_percentile,
            extreme_count_filter_percentage_of_samples = extreme_count_filter_percentage_of_samples,
            gatk4_jar_override = gatk4_jar_override,
            gatk_docker = gatk_docker,
            mem_gb = mem_gb_for_filter_intervals,
            preemptible_attempts = preemptible_attempts
    }

    call DetermineGermlineContigPloidyCohortMode {
        input:
            cohort_entity_id = cohort_entity_id,
            intervals = FilterIntervals.filtered_intervals,
            read_count_files = CollectCounts.counts,
            contig_ploidy_priors = contig_ploidy_priors,
            gatk4_jar_override = gatk4_jar_override,
            gatk_docker = gatk_docker,
            mem_gb = mem_gb_for_determine_germline_contig_ploidy,
            cpu = cpu_for_determine_germline_contig_ploidy,
            mean_bias_standard_deviation = ploidy_mean_bias_standard_deviation,
            mapping_error_rate = ploidy_mapping_error_rate,
            global_psi_scale = ploidy_global_psi_scale,
            sample_psi_scale = ploidy_sample_psi_scale,
            preemptible_attempts = preemptible_attempts
    }

    call CNVTasks.ScatterIntervals {
        input:
            interval_list = FilterIntervals.filtered_intervals,
            num_intervals_per_scatter = num_intervals_per_scatter,
            gatk_docker = gatk_docker,
            preemptible_attempts = preemptible_attempts
    }

    # One GermlineCNVCaller shard per scattered interval list; scatter_index names the shard outputs.
    scatter (scatter_index in range(length(ScatterIntervals.scattered_interval_lists))) {
        call GermlineCNVCallerCohortMode {
            input:
                scatter_index = scatter_index,
                cohort_entity_id = cohort_entity_id,
                read_count_files = CollectCounts.counts,
                contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar,
                intervals = ScatterIntervals.scattered_interval_lists[scatter_index],
                annotated_intervals = AnnotateIntervals.annotated_intervals,
                gatk4_jar_override = gatk4_jar_override,
                gatk_docker = gatk_docker,
                mem_gb = mem_gb_for_germline_cnv_caller,
                cpu = cpu_for_germline_cnv_caller,
                p_alt = gcnv_p_alt,
                p_active = gcnv_p_active,
                cnv_coherence_length = gcnv_cnv_coherence_length,
                class_coherence_length = gcnv_class_coherence_length,
                max_copy_number = gcnv_max_copy_number,
                max_bias_factors = gcnv_max_bias_factors,
                mapping_error_rate = gcnv_mapping_error_rate,
                interval_psi_scale = gcnv_interval_psi_scale,
                sample_psi_scale = gcnv_sample_psi_scale,
                depth_correction_tau = gcnv_depth_correction_tau,
                log_mean_bias_standard_deviation = gcnv_log_mean_bias_standard_deviation,
                init_ard_rel_unexplained_variance = gcnv_init_ard_rel_unexplained_variance,
                num_gc_bins = gcnv_num_gc_bins,
                gc_curve_standard_deviation = gcnv_gc_curve_standard_deviation,
                copy_number_posterior_expectation_mode = gcnv_copy_number_posterior_expectation_mode,
                enable_bias_factors = gcnv_enable_bias_factors,
                active_class_padding_hybrid_mode = gcnv_active_class_padding_hybrid_mode,
                learning_rate = gcnv_learning_rate,
                adamax_beta_1 = gcnv_adamax_beta_1,
                adamax_beta_2 = gcnv_adamax_beta_2,
                log_emission_samples_per_round = gcnv_log_emission_samples_per_round,
                log_emission_sampling_median_rel_error = gcnv_log_emission_sampling_median_rel_error,
                log_emission_sampling_rounds = gcnv_log_emission_sampling_rounds,
                max_advi_iter_first_epoch = gcnv_max_advi_iter_first_epoch,
                max_advi_iter_subsequent_epochs = gcnv_max_advi_iter_subsequent_epochs,
                min_training_epochs = gcnv_min_training_epochs,
                max_training_epochs = gcnv_max_training_epochs,
                initial_temperature = gcnv_initial_temperature,
                num_thermal_advi_iters = gcnv_num_thermal_advi_iters,
                convergence_snr_averaging_window = gcnv_convergence_snr_averaging_window,
                convergence_snr_trigger_threshold = gcnv_convergence_snr_trigger_threshold,
                convergence_snr_countdown_window = gcnv_convergence_snr_countdown_window,
                max_calling_iters = gcnv_max_calling_iters,
                caller_update_convergence_threshold = gcnv_caller_update_convergence_threshold,
                caller_internal_admixing_rate = gcnv_caller_internal_admixing_rate,
                caller_external_admixing_rate = gcnv_caller_external_admixing_rate,
                disable_annealing = gcnv_disable_annealing,
                preemptible_attempts = preemptible_attempts
        }
    }

    # gcnv_call_tars is gathered as [shard][sample]; transpose to [sample][shard] so that each
    # per-sample postprocessing call below can take all of its shards.
    Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCohortMode.gcnv_call_tars)

    scatter (sample_index in range(length(CollectCounts.entity_id))) {
        call CNVTasks.PostprocessGermlineCNVCalls {
            input:
                entity_id = CollectCounts.entity_id[sample_index],
                gcnv_calls_tars = call_tars_sample_by_shard[sample_index],
                gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar,
                calling_configs = GermlineCNVCallerCohortMode.calling_config_json,
                denoising_configs = GermlineCNVCallerCohortMode.denoising_config_json,
                gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json,
                sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list,
                contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar,
                allosomal_contigs = allosomal_contigs,
                ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs,
                sample_index = sample_index,
                gatk4_jar_override = gatk4_jar_override,
                gatk_docker = gatk_docker,
                preemptible_attempts = preemptible_attempts
        }

        call CNVTasks.CollectSampleQualityMetrics {
            input:
                genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf,
                entity_id = CollectCounts.entity_id[sample_index],
                maximum_number_events = maximum_number_events_per_sample,
                maximum_number_pass_events = maximum_number_pass_events_per_sample,
                bash_docker = gatk_docker,
                preemptible_attempts = preemptible_attempts
        }
    }

    call CNVTasks.CollectModelQualityMetrics {
        input:
            gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar,
            gatk_docker = gatk_docker,
            preemptible_attempts = preemptible_attempts
    }

    call CNVTasks.ScatterPloidyCallsBySample {
        input:
            contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar,
            samples = CollectCounts.entity_id,
            docker = gatk_docker,
            preemptible_attempts = preemptible_attempts
    }

    # The remaining calls only write lists of output paths to text files.
    call WritePathList as WritePloidyCalls {
        input:
            file_paths = [DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar],
            outfile = "contig_ploidy_calls_tar.paths.list"
    }

    call WritePathMatrix as WriteGCNVCalls {
        input:
            path_matrix = GermlineCNVCallerCohortMode.gcnv_call_tars,
            outfile = "gcnv_call_tars.paths.list"
    }

    call WritePathList as WriteSegments {
        input:
            file_paths = PostprocessGermlineCNVCalls.genotyped_segments_vcf,
            outfile = "genotyped_segments_vcf.paths.list"
    }

    call WritePathList as WriteSegmentIndexes {
        input:
            file_paths = PostprocessGermlineCNVCalls.genotyped_segments_vcf_index,
            outfile = "genotyped_segments_vcf_index.paths.list"
    }

    call WritePathList as WriteIntervals {
        input:
            file_paths = PostprocessGermlineCNVCalls.genotyped_intervals_vcf,
            outfile = "genotyped_intervals_vcf.paths.list"
    }

    call WritePathList as WriteIntervalIndexes {
        input:
            file_paths = PostprocessGermlineCNVCalls.genotyped_intervals_vcf_index,
            outfile = "genotyped_intervals_vcf_index.paths.list"
    }

    output {
        File preprocessed_intervals = PreprocessIntervals.preprocessed_intervals
        # NOTE(review): declared as Array[File], although entity_id is used as a sample
        # identifier elsewhere in this workflow. Confirm the type against CNVTasks.CollectCounts
        # before changing it, since the output type is part of this workflow's interface.
        Array[File] read_counts_entity_ids = CollectCounts.entity_id
        Array[File] read_counts = CollectCounts.counts
        File? annotated_intervals = AnnotateIntervals.annotated_intervals
        File filtered_intervals = FilterIntervals.filtered_intervals
        File contig_ploidy_model_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_model_tar
        File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar
        File contig_ploidy_calls_tar_path_list = WritePloidyCalls.path_list
        Array[File] sample_contig_ploidy_calls_tars = ScatterPloidyCallsBySample.sample_contig_ploidy_calls_tar
        Array[File] gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar
        Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_call_tars
        File gcnv_calls_tars_path_list = WriteGCNVCalls.path_list
        Array[File] gcnv_tracking_tars = GermlineCNVCallerCohortMode.gcnv_tracking_tar

        Array[File] genotyped_intervals_vcfs = PostprocessGermlineCNVCalls.genotyped_intervals_vcf
        File genotyped_intervals_vcfs_path_list = WriteIntervals.path_list
        Array[File] genotyped_intervals_vcf_indexes = PostprocessGermlineCNVCalls.genotyped_intervals_vcf_index
        File genotyped_intervals_vcf_indexes_path_list = WriteIntervalIndexes.path_list
        Array[File] genotyped_segments_vcfs = PostprocessGermlineCNVCalls.genotyped_segments_vcf
        File genotyped_segments_vcfs_path_list = WriteSegments.path_list
        Array[File] genotyped_segments_vcf_indexes = PostprocessGermlineCNVCalls.genotyped_segments_vcf_index
        File genotyped_segments_vcf_indexes_path_list = WriteSegmentIndexes.path_list

        Array[File] denoised_copy_ratios = PostprocessGermlineCNVCalls.denoised_copy_ratios
        Array[File] sample_qc_status_files = CollectSampleQualityMetrics.qc_status_file
        Array[String] sample_qc_status_strings = CollectSampleQualityMetrics.qc_status_string
        File model_qc_status_file = CollectModelQualityMetrics.qc_status_file
        String model_qc_string = CollectModelQualityMetrics.qc_status_string

        # gcnv_model_tars and denoised_copy_ratios are each declared once above; an output
        # name may appear only once in this section.
        Array[File] calling_configs = GermlineCNVCallerCohortMode.calling_config_json
        Array[File] denoising_configs = GermlineCNVCallerCohortMode.denoising_config_json
        Array[File] gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json
        Array[File] sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list
    }
}

# Runs GATK DetermineGermlineContigPloidy on the cohort's read-count files and packages the
# resulting "<cohort_entity_id>-model" and "<cohort_entity_id>-calls" directories as tarballs.
task DetermineGermlineContigPloidyCohortMode {
    input {
        String cohort_entity_id
        File? intervals
        Array[File] read_count_files
        File contig_ploidy_priors
        String? output_dir
        File? gatk4_jar_override

        # Runtime parameters
        String gatk_docker
        Int? mem_gb
        Int? disk_space_gb
        Boolean use_ssd = false
        Int? cpu
        Int? preemptible_attempts

        # Model parameters
        Float? mean_bias_standard_deviation
        Float? mapping_error_rate
        Float? global_psi_scale
        Float? sample_psi_scale
    }

    # We do not expose Hybrid ADVI parameters -- the default values are decent

    # Machine memory defaults to 7 GB; the JVM heap is given 500 MB less than the machine.
    Int machine_mem_mb = select_first([mem_gb, 7]) * 1000
    Int command_mem_mb = machine_mem_mb - 500

    # If optional output_dir not specified, use "out"
    String output_dir_ = select_first([output_dir, "out"])

    command <<<
        set -eu
        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override}
        export MKL_NUM_THREADS=~{default=8 cpu}
        export OMP_NUM_THREADS=~{default=8 cpu}

        gatk --java-options "-Xmx~{command_mem_mb}m" DetermineGermlineContigPloidy \
            ~{"-L " + intervals} \
            --input ~{sep=" --input " read_count_files} \
            --contig-ploidy-priors ~{contig_ploidy_priors} \
            --interval-merging-rule OVERLAPPING_ONLY \
            --output ~{output_dir_} \
            --output-prefix ~{cohort_entity_id} \
            --verbosity DEBUG \
            --mean-bias-standard-deviation ~{default="0.01" mean_bias_standard_deviation} \
            --mapping-error-rate ~{default="0.01" mapping_error_rate} \
            --global-psi-scale ~{default="0.001" global_psi_scale} \
            --sample-psi-scale ~{default="0.0001" sample_psi_scale}

        tar czf ~{cohort_entity_id}-contig-ploidy-model.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-model .
        tar czf ~{cohort_entity_id}-contig-ploidy-calls.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-calls .
    >>>

    runtime {
        docker: gatk_docker
        memory: machine_mem_mb + " MB"
        disks: "local-disk " + select_first([disk_space_gb, 150]) + if use_ssd then " SSD" else " HDD"
        cpu: select_first([cpu, 8])
        preemptible: select_first([preemptible_attempts, 2])
    }

    output {
        File contig_ploidy_model_tar = "~{cohort_entity_id}-contig-ploidy-model.tar.gz"
        File contig_ploidy_calls_tar = "~{cohort_entity_id}-contig-ploidy-calls.tar.gz"
    }
}

# Runs GATK GermlineCNVCaller in COHORT mode on one interval shard and packages the model,
# tracking, and per-sample call directories as tarballs named with the shard index.
task GermlineCNVCallerCohortMode {
    input {
        Int scatter_index
        String cohort_entity_id
        Array[File] read_count_files
        File contig_ploidy_calls_tar
        File intervals
        File? annotated_intervals
        String? output_dir
        File? gatk4_jar_override

        # Runtime parameters
        String gatk_docker
        Int? mem_gb
        Int? disk_space_gb
        Boolean use_ssd = false
        Int? cpu
        Int? preemptible_attempts

        # Caller parameters
        Float? p_alt
        Float? p_active
        Float? cnv_coherence_length
        Float? class_coherence_length
        Int? max_copy_number

        # Denoising model parameters
        Int? max_bias_factors
        Float? mapping_error_rate
        Float? interval_psi_scale
        Float? sample_psi_scale
        Float? depth_correction_tau
        Float? log_mean_bias_standard_deviation
        Float? init_ard_rel_unexplained_variance
        Int? num_gc_bins
        Float? gc_curve_standard_deviation
        String? copy_number_posterior_expectation_mode
        Boolean? enable_bias_factors
        Int? active_class_padding_hybrid_mode

        # Hybrid ADVI parameters
        Float? learning_rate
        Float? adamax_beta_1
        Float? adamax_beta_2
        Int? log_emission_samples_per_round
        Float? log_emission_sampling_median_rel_error
        Int? log_emission_sampling_rounds
        Int? max_advi_iter_first_epoch
        Int? max_advi_iter_subsequent_epochs
        Int? min_training_epochs
        Int? max_training_epochs
        Float? initial_temperature
        Int? num_thermal_advi_iters
        Int? convergence_snr_averaging_window
        Float? convergence_snr_trigger_threshold
        Int? convergence_snr_countdown_window
        Int? max_calling_iters
        Float? caller_update_convergence_threshold
        Float? caller_internal_admixing_rate
        Float? caller_external_admixing_rate
        Boolean? disable_annealing
    }

    # Machine memory defaults to 7 GB; the JVM heap is given 500 MB less than the machine.
    Int machine_mem_mb = select_first([mem_gb, 7]) * 1000
    Int command_mem_mb = machine_mem_mb - 500

    # If optional output_dir not specified, use "out"
    String output_dir_ = select_first([output_dir, "out"])
    # Used by the shell loop below to tar one SAMPLE_<i> directory per input count file.
    Int num_samples = length(read_count_files)

    command <<<
        set -eu
        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override}
        export MKL_NUM_THREADS=~{default=8 cpu}
        export OMP_NUM_THREADS=~{default=8 cpu}

        mkdir contig-ploidy-calls
        tar xzf ~{contig_ploidy_calls_tar} -C contig-ploidy-calls

        gatk --java-options "-Xmx~{command_mem_mb}m" GermlineCNVCaller \
            --run-mode COHORT \
            -L ~{intervals} \
            --input ~{sep=" --input " read_count_files} \
            --contig-ploidy-calls contig-ploidy-calls \
            ~{"--annotated-intervals " + annotated_intervals} \
            --interval-merging-rule OVERLAPPING_ONLY \
            --output ~{output_dir_} \
            --output-prefix ~{cohort_entity_id} \
            --verbosity DEBUG \
            --p-alt ~{default="1e-6" p_alt} \
            --p-active ~{default="1e-2" p_active} \
            --cnv-coherence-length ~{default="10000.0" cnv_coherence_length} \
            --class-coherence-length ~{default="10000.0" class_coherence_length} \
            --max-copy-number ~{default="5" max_copy_number} \
            --max-bias-factors ~{default="5" max_bias_factors} \
            --mapping-error-rate ~{default="0.01" mapping_error_rate} \
            --interval-psi-scale ~{default="0.001" interval_psi_scale} \
            --sample-psi-scale ~{default="0.0001" sample_psi_scale} \
            --depth-correction-tau ~{default="10000.0" depth_correction_tau} \
            --log-mean-bias-standard-deviation ~{default="0.1" log_mean_bias_standard_deviation} \
            --init-ard-rel-unexplained-variance ~{default="0.1" init_ard_rel_unexplained_variance} \
            --num-gc-bins ~{default="20" num_gc_bins} \
            --gc-curve-standard-deviation ~{default="1.0" gc_curve_standard_deviation} \
            --copy-number-posterior-expectation-mode ~{default="HYBRID" copy_number_posterior_expectation_mode} \
            --enable-bias-factors ~{default="true" enable_bias_factors} \
            --active-class-padding-hybrid-mode ~{default="50000" active_class_padding_hybrid_mode} \
            --learning-rate ~{default="0.05" learning_rate} \
            --adamax-beta-1 ~{default="0.9" adamax_beta_1} \
            --adamax-beta-2 ~{default="0.99" adamax_beta_2} \
            --log-emission-samples-per-round ~{default="50" log_emission_samples_per_round} \
            --log-emission-sampling-median-rel-error ~{default="0.005" log_emission_sampling_median_rel_error} \
            --log-emission-sampling-rounds ~{default="10" log_emission_sampling_rounds} \
            --max-advi-iter-first-epoch ~{default="5000" max_advi_iter_first_epoch} \
            --max-advi-iter-subsequent-epochs ~{default="100" max_advi_iter_subsequent_epochs} \
            --min-training-epochs ~{default="10" min_training_epochs} \
            --max-training-epochs ~{default="100" max_training_epochs} \
            --initial-temperature ~{default="2.0" initial_temperature} \
            --num-thermal-advi-iters ~{default="2500" num_thermal_advi_iters} \
            --convergence-snr-averaging-window ~{default="500" convergence_snr_averaging_window} \
            --convergence-snr-trigger-threshold ~{default="0.1" convergence_snr_trigger_threshold} \
            --convergence-snr-countdown-window ~{default="10" convergence_snr_countdown_window} \
            --max-calling-iters ~{default="10" max_calling_iters} \
            --caller-update-convergence-threshold ~{default="0.001" caller_update_convergence_threshold} \
            --caller-internal-admixing-rate ~{default="0.75" caller_internal_admixing_rate} \
            --caller-external-admixing-rate ~{default="1.00" caller_external_admixing_rate} \
            --disable-annealing ~{default="false" disable_annealing}

        tar czf ~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-model .
        tar czf ~{cohort_entity_id}-gcnv-tracking-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-tracking .

        # Tar each SAMPLE_<i> calls directory separately. The sample index in the tarball name is
        # zero-padded to the number of digits in NUM_SAMPLES so that the glob() in the output
        # section returns the tarballs in sample order.
        CURRENT_SAMPLE=0
        NUM_SAMPLES=~{num_samples}
        NUM_DIGITS=${#NUM_SAMPLES}
        while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do
            CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${NUM_DIGITS}d" $CURRENT_SAMPLE)
            tar czf ~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-calls/SAMPLE_$CURRENT_SAMPLE .
            let CURRENT_SAMPLE=CURRENT_SAMPLE+1
        done

        rm -rf contig-ploidy-calls
    >>>

    runtime {
        docker: gatk_docker
        memory: machine_mem_mb + " MB"
        disks: "local-disk " + select_first([disk_space_gb, 150]) + if use_ssd then " SSD" else " HDD"
        cpu: select_first([cpu, 8])
        preemptible: select_first([preemptible_attempts, 2])
    }

    output {
        File gcnv_model_tar = "~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz"
        Array[File] gcnv_call_tars = glob("~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}-sample-*.tar.gz")
        File gcnv_tracking_tar = "~{cohort_entity_id}-gcnv-tracking-shard-~{scatter_index}.tar.gz"
        File calling_config_json = "~{output_dir_}/~{cohort_entity_id}-calls/calling_config.json"
        File denoising_config_json = "~{output_dir_}/~{cohort_entity_id}-calls/denoising_config.json"
        File gcnvkernel_version_json = "~{output_dir_}/~{cohort_entity_id}-calls/gcnvkernel_version.json"
        File sharded_interval_list = "~{output_dir_}/~{cohort_entity_id}-calls/interval_list.tsv"
    }
}

# Writes the given paths, one per line, to a text file named by outfile.
task WritePathList {
    input {
        Array[String] file_paths
        String outfile

        # Runtime parameters
        String docker = "python:latest"
        Int machine_mem_gb = 7
        Int disk_space_gb = 100
        Int preemptible_attempts = 3
    }

    command <<<
        set -oe pipefail

        python << CODE
        file_paths = ['~{sep="','" file_paths}']

        with open("path_list.txt", "w") as fi:
          for path in file_paths:
            fi.write(path + "\n")

        CODE
        mv path_list.txt ~{outfile}
    >>>

    runtime {
        docker: docker
        memory: machine_mem_gb + " GB"
        disks: "local-disk " + disk_space_gb + " HDD"
        # Honor the preemptible_attempts input (default 3) rather than a hard-coded literal.
        preemptible: preemptible_attempts
    }

    output {
        File path_list = outfile
    }
}

# Writes a 2-D array of paths as a TSV (via write_tsv) and renames the result to outfile.
task WritePathMatrix {
    input {
        Array[Array[String]] path_matrix
        String outfile

        # Runtime parameters (declared as inputs, with the previous values as defaults, so they
        # can be overridden in the same way as for WritePathList)
        String docker = "python:latest"
        Int machine_mem_gb = 7
        Int disk_space_gb = 100
        Int preemptible_attempts = 3
    }

    command <<<
        mv ~{write_tsv(path_matrix)} ~{outfile}
    >>>

    runtime {
        docker: docker
        memory: machine_mem_gb + " GB"
        disks: "local-disk " + disk_space_gb + " HDD"
        # Honor the preemptible_attempts input (default 3) rather than a hard-coded literal.
        preemptible: preemptible_attempts
    }

    output {
        File path_list = outfile
    }
}