1# Workflow for running the GATK CNV pipeline on a matched tumor-normal pair (or on a tumor sample alone when normal_bam is omitted). Supports both WGS and WES.
2#
3# Notes:
4#
5# - The intervals argument is required for both WGS and WES workflows and accepts formats compatible with the
6#   GATK -L argument (see https://gatkforums.broadinstitute.org/gatk/discussion/11009/intervals-and-interval-lists).
7#   These intervals will be padded on both sides by the amount specified by padding (default 250)
8#   and split into bins of length specified by bin_length (default 1000; specify 0 to skip binning,
9#   e.g., for WES).  For WGS, the intervals should simply cover the autosomal chromosomes (sex chromosomes may be
10#   included, but care should be taken to 1) avoid creating panels of mixed sex, and 2) denoise case samples only
11#   with panels containing only individuals of the same sex as the case samples).
12#
13# - Intervals can be blacklisted from coverage collection and all downstream steps by using the blacklist_intervals
14#   argument, which accepts formats compatible with the GATK -XL argument
15#   (see https://gatkforums.broadinstitute.org/gatk/discussion/11009/intervals-and-interval-lists).
16#   This may be useful for excluding centromeric regions, etc. from analysis.  Alternatively, these regions may
17#   be manually filtered from the final callset.
18#
19#  A reasonable blacklist for excluded intervals (-XL) can be found at:
20#   hg19: gs://gatk-best-practices/somatic-b37/CNV_and_centromere_blacklist.hg19.list
21#   hg38: gs://gatk-best-practices/somatic-hg38/CNV_and_centromere_blacklist.hg38liftover.list (untested)
22#
23# - The sites file (common_sites) should be a Picard or GATK-style interval list.  This is a list of sites
24#   of known variation at which allelic counts will be collected for use in modeling minor-allele fractions.
25#
26# - If you opt to run FuncotateSegments (i.e. set `is_run_funcotator` to `true`), then please also ensure that you have
27#       the correct value for `funcotator_ref_version`.  Treat `funcotator_ref_version` as required if
28#       `is_run_funcotator` is `true` (if it is left unset, this workflow passes `hg19` to FuncotateSegments).
29#       Valid values for `funcotator_ref_version` are `hg38` and `hg19`.  The latter includes GRCh37.
30#
31#
32# - Example invocation:
33#
34#       java -jar cromwell.jar run cnv_somatic_pair_workflow.wdl -i my_parameters.json
35#
36#############
37
38version 1.0
39
40import "../cnv_common_tasks.wdl" as CNVTasks
41import "cnv_somatic_oncotator_workflow.wdl" as CNVOncotator
42import "cnv_somatic_funcotate_seg_workflow.wdl" as CNVFuncotateSegments
43
44workflow CNVSomaticPairWorkflow {
45
46    input {
47      ##################################
48      #### required basic arguments ####
49      ##################################
50      File common_sites
51      File intervals
52      File? blacklist_intervals
53      File tumor_bam
54      File tumor_bam_idx
55      File? normal_bam
56      File? normal_bam_idx
57      File read_count_pon
58      File ref_fasta_dict
59      File ref_fasta_fai
60      File ref_fasta
61      String gatk_docker
62
63      ##################################
64      #### optional basic arguments ####
65      ##################################
66       # For running oncotator
67      Boolean? is_run_oncotator
68       # For running funcotator
69      Boolean? is_run_funcotator
70
71      File? gatk4_jar_override
72      Int? preemptible_attempts
73      # Use as a last resort to increase the disk given to every task in case of ill-behaved data
74      Int? emergency_extra_disk
75
76      # Required if BAM/CRAM is in a requester pays bucket
77      String? gcs_project_for_requester_pays
78
79      ####################################################
80      #### optional arguments for PreprocessIntervals ####
81      ####################################################
82      Int? padding
83      Int? bin_length
84      Int? mem_gb_for_preprocess_intervals
85
86      ##############################################
87      #### optional arguments for CollectCounts ####
88      ##############################################
89      String? collect_counts_format
90      Int? mem_gb_for_collect_counts
91
92      #####################################################
93      #### optional arguments for CollectAllelicCounts ####
94      #####################################################
95      String? minimum_base_quality
96      Int? mem_gb_for_collect_allelic_counts
97
98      ##################################################
99      #### optional arguments for DenoiseReadCounts ####
100      ##################################################
101      Int? number_of_eigensamples
102      Int? mem_gb_for_denoise_read_counts
103
104      ##############################################
105      #### optional arguments for ModelSegments ####
106      ##############################################
107      Int? max_num_segments_per_chromosome
108      Int? min_total_allele_count
109      Int? min_total_allele_count_normal
110      Float? genotyping_homozygous_log_ratio_threshold
111      Float? genotyping_base_error_rate
112      Float? kernel_variance_copy_ratio
113      Float? kernel_variance_allele_fraction
114      Float? kernel_scaling_allele_fraction
115      Int? kernel_approximation_dimension
116      Array[Int]+? window_sizes = [8, 16, 32, 64, 128, 256]
117      Float? num_changepoints_penalty_factor
118      Float? minor_allele_fraction_prior_alpha
119      Int? num_samples_copy_ratio
120      Int? num_burn_in_copy_ratio
121      Int? num_samples_allele_fraction
122      Int? num_burn_in_allele_fraction
123      Float? smoothing_threshold_copy_ratio
124      Float? smoothing_threshold_allele_fraction
125      Int? max_num_smoothing_iterations
126      Int? num_smoothing_iterations_per_fit
127      Int? mem_gb_for_model_segments
128
129      ######################################################
130      #### optional arguments for CallCopyRatioSegments ####
131      ######################################################
132      Float? neutral_segment_copy_ratio_lower_bound
133      Float? neutral_segment_copy_ratio_upper_bound
134      Float? outlier_neutral_segment_copy_ratio_z_score_threshold
135      Float? calling_copy_ratio_z_score_threshold
136      Int? mem_gb_for_call_copy_ratio_segments
137
138      #########################################
139      #### optional arguments for plotting ####
140      #########################################
141      Int? minimum_contig_length
142      # If maximum_copy_ratio = Infinity, the maximum copy ratio will be automatically determined
143      String? maximum_copy_ratio
144      Float? point_size_copy_ratio
145      Float? point_size_allele_fraction
146      Int? mem_gb_for_plotting
147
148      ##########################################
149      #### optional arguments for Oncotator ####
150      ##########################################
151      String? additional_args_for_oncotator
152      String? oncotator_docker
153      Int? mem_gb_for_oncotator
154      Int? boot_disk_space_gb_for_oncotator
155
156      ##################################################
157      #### optional arguments for FuncotateSegments ####
158      ##################################################
159      String? additional_args_for_funcotator
160      String? funcotator_ref_version
161      Int? mem_gb_for_funcotator
162      File? funcotator_transcript_selection_list
163      File? funcotator_data_sources_tar_gz
164      String? funcotator_transcript_selection_mode
165      Array[String]? funcotator_annotation_defaults
166      Array[String]? funcotator_annotation_overrides
167      Array[String]? funcotator_excluded_fields
168      Boolean? funcotator_is_removing_untared_datasources
169      Int? funcotator_disk_space_gb
170      Boolean? funcotator_use_ssd
171      Int? funcotator_cpu
172    }
173
174    Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_fasta_dict, "GB") + size(ref_fasta_fai, "GB"))
175    Int read_count_pon_size = ceil(size(read_count_pon, "GB"))
176    Int tumor_bam_size = ceil(size(tumor_bam, "GB") + size(tumor_bam_idx, "GB"))
177    Int normal_bam_size = if defined(normal_bam) then ceil(size(normal_bam, "GB") + size(normal_bam_idx, "GB")) else 0
178
179    Int gatk4_override_size = if defined(gatk4_jar_override) then ceil(size(gatk4_jar_override, "GB")) else 0
180    # This is added to the disk size of every task as padding; increase it if you systematically need more disk for every call
181    Int disk_pad = 20 + ceil(size(intervals, "GB")) + ceil(size(common_sites, "GB")) + gatk4_override_size + select_first([emergency_extra_disk, 0])
182
183    File final_normal_bam = select_first([normal_bam, "null"])
184    File final_normal_bam_idx = select_first([normal_bam_idx, "null"])
185
186    Int preprocess_intervals_disk = ref_size + disk_pad
187    call CNVTasks.PreprocessIntervals {
188        input:
189            intervals = intervals,
190            blacklist_intervals = blacklist_intervals,
191            ref_fasta = ref_fasta,
192            ref_fasta_fai = ref_fasta_fai,
193            ref_fasta_dict = ref_fasta_dict,
194            padding = padding,
195            bin_length = bin_length,
196            gatk4_jar_override = gatk4_jar_override,
197            gatk_docker = gatk_docker,
198            mem_gb = mem_gb_for_preprocess_intervals,
199            disk_space_gb = preprocess_intervals_disk,
200            preemptible_attempts = preemptible_attempts
201    }
202
203    Int collect_counts_tumor_disk = tumor_bam_size + ceil(size(PreprocessIntervals.preprocessed_intervals, "GB")) + disk_pad
204    call CNVTasks.CollectCounts as CollectCountsTumor {
205        input:
206            intervals = PreprocessIntervals.preprocessed_intervals,
207            bam = tumor_bam,
208            bam_idx = tumor_bam_idx,
209            ref_fasta = ref_fasta,
210            ref_fasta_fai = ref_fasta_fai,
211            ref_fasta_dict = ref_fasta_dict,
212            format = collect_counts_format,
213            enable_indexing = false,
214            gatk4_jar_override = gatk4_jar_override,
215            gatk_docker = gatk_docker,
216            mem_gb = mem_gb_for_collect_counts,
217            disk_space_gb = collect_counts_tumor_disk,
218            preemptible_attempts = preemptible_attempts,
219            gcs_project_for_requester_pays = gcs_project_for_requester_pays
220    }
221
222    Int collect_allelic_counts_tumor_disk = tumor_bam_size + ref_size + disk_pad
223    call CNVTasks.CollectAllelicCounts as CollectAllelicCountsTumor {
224        input:
225            common_sites = common_sites,
226            bam = tumor_bam,
227            bam_idx = tumor_bam_idx,
228            ref_fasta = ref_fasta,
229            ref_fasta_dict = ref_fasta_dict,
230            ref_fasta_fai = ref_fasta_fai,
231            minimum_base_quality =  minimum_base_quality,
232            gatk4_jar_override = gatk4_jar_override,
233            gatk_docker = gatk_docker,
234            mem_gb = mem_gb_for_collect_allelic_counts,
235            disk_space_gb = collect_allelic_counts_tumor_disk,
236            preemptible_attempts = preemptible_attempts,
237            gcs_project_for_requester_pays = gcs_project_for_requester_pays
238    }
239
240    Int denoise_read_counts_tumor_disk = read_count_pon_size + ceil(size(CollectCountsTumor.counts, "GB")) + disk_pad
241    call DenoiseReadCounts as DenoiseReadCountsTumor {
242        input:
243            entity_id = CollectCountsTumor.entity_id,
244            read_counts = CollectCountsTumor.counts,
245            read_count_pon = read_count_pon,
246            number_of_eigensamples = number_of_eigensamples,
247            gatk4_jar_override = gatk4_jar_override,
248            gatk_docker = gatk_docker,
249            mem_gb = mem_gb_for_denoise_read_counts,
250            disk_space_gb = denoise_read_counts_tumor_disk,
251            preemptible_attempts = preemptible_attempts
252    }
253
254    Int model_segments_normal_portion = if defined(normal_bam) then ceil(size(CollectAllelicCountsNormal.allelic_counts, "GB")) else 0
255    Int model_segments_tumor_disk = ceil(size(DenoiseReadCountsTumor.denoised_copy_ratios, "GB")) + ceil(size(CollectAllelicCountsTumor.allelic_counts, "GB")) + model_segments_normal_portion + disk_pad
256    call ModelSegments as ModelSegmentsTumor {
257        input:
258            entity_id = CollectCountsTumor.entity_id,
259            denoised_copy_ratios = DenoiseReadCountsTumor.denoised_copy_ratios,
260            allelic_counts = CollectAllelicCountsTumor.allelic_counts,
261            normal_allelic_counts = CollectAllelicCountsNormal.allelic_counts,
262            max_num_segments_per_chromosome = max_num_segments_per_chromosome,
263            min_total_allele_count = min_total_allele_count,
264            min_total_allele_count_normal = min_total_allele_count_normal,
265            genotyping_homozygous_log_ratio_threshold = genotyping_homozygous_log_ratio_threshold,
266            genotyping_base_error_rate = genotyping_base_error_rate,
267            kernel_variance_copy_ratio = kernel_variance_copy_ratio,
268            kernel_variance_allele_fraction = kernel_variance_allele_fraction,
269            kernel_scaling_allele_fraction = kernel_scaling_allele_fraction,
270            kernel_approximation_dimension = kernel_approximation_dimension,
271            window_sizes = window_sizes,
272            num_changepoints_penalty_factor = num_changepoints_penalty_factor,
273            minor_allele_fraction_prior_alpha = minor_allele_fraction_prior_alpha,
274            num_samples_copy_ratio = num_samples_copy_ratio,
275            num_burn_in_copy_ratio = num_burn_in_copy_ratio,
276            num_samples_allele_fraction = num_samples_allele_fraction,
277            num_burn_in_allele_fraction = num_burn_in_allele_fraction,
278            smoothing_threshold_copy_ratio = smoothing_threshold_copy_ratio,
279            smoothing_threshold_allele_fraction = smoothing_threshold_allele_fraction,
280            max_num_smoothing_iterations = max_num_smoothing_iterations,
281            num_smoothing_iterations_per_fit = num_smoothing_iterations_per_fit,
282            gatk4_jar_override = gatk4_jar_override,
283            gatk_docker = gatk_docker,
284            mem_gb = mem_gb_for_model_segments,
285            disk_space_gb = model_segments_tumor_disk,
286            preemptible_attempts = preemptible_attempts
287    }
288
289    Int copy_ratio_segments_tumor_disk = ceil(size(DenoiseReadCountsTumor.denoised_copy_ratios, "GB")) + ceil(size(ModelSegmentsTumor.copy_ratio_only_segments, "GB")) + disk_pad
290    call CallCopyRatioSegments as CallCopyRatioSegmentsTumor {
291        input:
292            entity_id = CollectCountsTumor.entity_id,
293            copy_ratio_segments = ModelSegmentsTumor.copy_ratio_only_segments,
294            neutral_segment_copy_ratio_lower_bound = neutral_segment_copy_ratio_lower_bound,
295            neutral_segment_copy_ratio_upper_bound = neutral_segment_copy_ratio_upper_bound,
296            outlier_neutral_segment_copy_ratio_z_score_threshold = outlier_neutral_segment_copy_ratio_z_score_threshold,
297            calling_copy_ratio_z_score_threshold = calling_copy_ratio_z_score_threshold,
298            gatk4_jar_override = gatk4_jar_override,
299            gatk_docker = gatk_docker,
300            mem_gb = mem_gb_for_call_copy_ratio_segments,
301            disk_space_gb = copy_ratio_segments_tumor_disk,
302            preemptible_attempts = preemptible_attempts
303    }
304
305    # The files from other tasks are small enough to just combine into one disk variable and pass to the tumor plotting tasks
306    Int plot_tumor_disk = ref_size + ceil(size(DenoiseReadCountsTumor.standardized_copy_ratios, "GB")) + ceil(size(DenoiseReadCountsTumor.denoised_copy_ratios, "GB")) + ceil(size(ModelSegmentsTumor.het_allelic_counts, "GB")) + ceil(size(ModelSegmentsTumor.modeled_segments, "GB")) + disk_pad
307    call PlotDenoisedCopyRatios as PlotDenoisedCopyRatiosTumor {
308        input:
309            entity_id = CollectCountsTumor.entity_id,
310            standardized_copy_ratios = DenoiseReadCountsTumor.standardized_copy_ratios,
311            denoised_copy_ratios = DenoiseReadCountsTumor.denoised_copy_ratios,
312            ref_fasta_dict = ref_fasta_dict,
313            minimum_contig_length = minimum_contig_length,
314            maximum_copy_ratio = maximum_copy_ratio,
315            point_size_copy_ratio = point_size_copy_ratio,
316            gatk4_jar_override = gatk4_jar_override,
317            gatk_docker = gatk_docker,
318            mem_gb = mem_gb_for_plotting,
319            disk_space_gb = plot_tumor_disk,
320            preemptible_attempts = preemptible_attempts
321    }
322
323    call PlotModeledSegments as PlotModeledSegmentsTumor {
324        input:
325            entity_id = CollectCountsTumor.entity_id,
326            denoised_copy_ratios = DenoiseReadCountsTumor.denoised_copy_ratios,
327            het_allelic_counts = ModelSegmentsTumor.het_allelic_counts,
328            modeled_segments = ModelSegmentsTumor.modeled_segments,
329            ref_fasta_dict = ref_fasta_dict,
330            minimum_contig_length = minimum_contig_length,
331            maximum_copy_ratio = maximum_copy_ratio,
332            point_size_copy_ratio = point_size_copy_ratio,
333            point_size_allele_fraction = point_size_allele_fraction,
334            gatk4_jar_override = gatk4_jar_override,
335            gatk_docker = gatk_docker,
336            mem_gb = mem_gb_for_plotting,
337            disk_space_gb = plot_tumor_disk,
338            preemptible_attempts = preemptible_attempts
339    }
340
341    Int collect_counts_normal_disk = normal_bam_size + ceil(size(PreprocessIntervals.preprocessed_intervals, "GB")) + disk_pad
342    if (defined(normal_bam)) {
343        call CNVTasks.CollectCounts as CollectCountsNormal {
344            input:
345                intervals = PreprocessIntervals.preprocessed_intervals,
346                bam = final_normal_bam,
347                bam_idx = final_normal_bam_idx,
348                ref_fasta = ref_fasta,
349                ref_fasta_fai = ref_fasta_fai,
350                ref_fasta_dict = ref_fasta_dict,
351                format = collect_counts_format,
352                enable_indexing = false,
353                gatk4_jar_override = gatk4_jar_override,
354                gatk_docker = gatk_docker,
355                mem_gb = mem_gb_for_collect_counts,
356                disk_space_gb = collect_counts_normal_disk,
357                preemptible_attempts = preemptible_attempts,
358                gcs_project_for_requester_pays = gcs_project_for_requester_pays
359        }
360
361        Int collect_allelic_counts_normal_disk = normal_bam_size + ref_size + disk_pad
362        call CNVTasks.CollectAllelicCounts as CollectAllelicCountsNormal {
363            input:
364                common_sites = common_sites,
365                bam = final_normal_bam,
366                bam_idx = final_normal_bam_idx,
367                ref_fasta = ref_fasta,
368                ref_fasta_dict = ref_fasta_dict,
369                ref_fasta_fai = ref_fasta_fai,
370                minimum_base_quality =  minimum_base_quality,
371                gatk4_jar_override = gatk4_jar_override,
372                gatk_docker = gatk_docker,
373                mem_gb = mem_gb_for_collect_allelic_counts,
374                disk_space_gb = collect_allelic_counts_normal_disk,
375                preemptible_attempts = preemptible_attempts,
376                gcs_project_for_requester_pays = gcs_project_for_requester_pays
377        }
378
379        Int denoise_read_counts_normal_disk = read_count_pon_size + ceil(size(CollectCountsNormal.counts, "GB")) + disk_pad
380        call DenoiseReadCounts as DenoiseReadCountsNormal {
381            input:
382                entity_id = CollectCountsNormal.entity_id,
383                read_counts = CollectCountsNormal.counts,
384                read_count_pon = read_count_pon,
385                number_of_eigensamples = number_of_eigensamples,
386                gatk4_jar_override = gatk4_jar_override,
387                gatk_docker = gatk_docker,
388                mem_gb = mem_gb_for_denoise_read_counts,
389                disk_space_gb = denoise_read_counts_normal_disk,
390                preemptible_attempts = preemptible_attempts
391        }
392
393        Int model_segments_normal_disk = ceil(size(DenoiseReadCountsNormal.denoised_copy_ratios, "GB")) + ceil(size(CollectAllelicCountsNormal.allelic_counts, "GB")) + disk_pad
394        call ModelSegments as ModelSegmentsNormal {
395            input:
396                entity_id = CollectCountsNormal.entity_id,
397                denoised_copy_ratios = DenoiseReadCountsNormal.denoised_copy_ratios,
398                allelic_counts = CollectAllelicCountsNormal.allelic_counts,
399                max_num_segments_per_chromosome = max_num_segments_per_chromosome,
400                min_total_allele_count = min_total_allele_count_normal,
401                genotyping_homozygous_log_ratio_threshold = genotyping_homozygous_log_ratio_threshold,
402                genotyping_base_error_rate = genotyping_base_error_rate,
403                kernel_variance_copy_ratio = kernel_variance_copy_ratio,
404                kernel_variance_allele_fraction = kernel_variance_allele_fraction,
405                kernel_scaling_allele_fraction = kernel_scaling_allele_fraction,
406                kernel_approximation_dimension = kernel_approximation_dimension,
407                window_sizes = window_sizes,
408                num_changepoints_penalty_factor = num_changepoints_penalty_factor,
409                minor_allele_fraction_prior_alpha = minor_allele_fraction_prior_alpha,
410                num_samples_copy_ratio = num_samples_copy_ratio,
411                num_burn_in_copy_ratio = num_burn_in_copy_ratio,
412                num_samples_allele_fraction = num_samples_allele_fraction,
413                num_burn_in_allele_fraction = num_burn_in_allele_fraction,
414                smoothing_threshold_copy_ratio = smoothing_threshold_copy_ratio,
415                smoothing_threshold_allele_fraction = smoothing_threshold_allele_fraction,
416                max_num_smoothing_iterations = max_num_smoothing_iterations,
417                num_smoothing_iterations_per_fit = num_smoothing_iterations_per_fit,
418                gatk4_jar_override = gatk4_jar_override,
419                gatk_docker = gatk_docker,
420                mem_gb = mem_gb_for_model_segments,
421                disk_space_gb = model_segments_normal_disk,
422                preemptible_attempts = preemptible_attempts
423        }
424
425        Int copy_ratio_segments_normal_disk = ceil(size(DenoiseReadCountsNormal.denoised_copy_ratios, "GB")) + ceil(size(ModelSegmentsNormal.copy_ratio_only_segments, "GB")) + disk_pad
426        call CallCopyRatioSegments as CallCopyRatioSegmentsNormal {
427            input:
428                entity_id = CollectCountsNormal.entity_id,
429                copy_ratio_segments = ModelSegmentsNormal.copy_ratio_only_segments,
430                neutral_segment_copy_ratio_lower_bound = neutral_segment_copy_ratio_lower_bound,
431                neutral_segment_copy_ratio_upper_bound = neutral_segment_copy_ratio_upper_bound,
432                outlier_neutral_segment_copy_ratio_z_score_threshold = outlier_neutral_segment_copy_ratio_z_score_threshold,
433                calling_copy_ratio_z_score_threshold = calling_copy_ratio_z_score_threshold,
434                gatk4_jar_override = gatk4_jar_override,
435                gatk_docker = gatk_docker,
436                mem_gb = mem_gb_for_call_copy_ratio_segments,
437                disk_space_gb = copy_ratio_segments_normal_disk,
438                preemptible_attempts = preemptible_attempts
439        }
440
441        # The files from other tasks are small enough to just combine into one disk variable and pass to the normal plotting tasks
442        Int plot_normal_disk = ref_size + ceil(size(DenoiseReadCountsNormal.standardized_copy_ratios, "GB")) + ceil(size(DenoiseReadCountsNormal.denoised_copy_ratios, "GB")) + ceil(size(ModelSegmentsNormal.het_allelic_counts, "GB")) + ceil(size(ModelSegmentsNormal.modeled_segments, "GB")) + disk_pad
443        call PlotDenoisedCopyRatios as PlotDenoisedCopyRatiosNormal {
444            input:
445                entity_id = CollectCountsNormal.entity_id,
446                standardized_copy_ratios = DenoiseReadCountsNormal.standardized_copy_ratios,
447                denoised_copy_ratios = DenoiseReadCountsNormal.denoised_copy_ratios,
448                ref_fasta_dict = ref_fasta_dict,
449                minimum_contig_length = minimum_contig_length,
450                maximum_copy_ratio = maximum_copy_ratio,
451                point_size_copy_ratio = point_size_copy_ratio,
452                gatk4_jar_override = gatk4_jar_override,
453                gatk_docker = gatk_docker,
454                mem_gb = mem_gb_for_plotting,
455                disk_space_gb = plot_normal_disk,
456                preemptible_attempts = preemptible_attempts
457        }
458
459        call PlotModeledSegments as PlotModeledSegmentsNormal {
460            input:
461                entity_id = CollectCountsNormal.entity_id,
462                denoised_copy_ratios = DenoiseReadCountsNormal.denoised_copy_ratios,
463                het_allelic_counts = ModelSegmentsNormal.het_allelic_counts,
464                modeled_segments = ModelSegmentsNormal.modeled_segments,
465                ref_fasta_dict = ref_fasta_dict,
466                minimum_contig_length = minimum_contig_length,
467                maximum_copy_ratio = maximum_copy_ratio,
468                point_size_copy_ratio = point_size_copy_ratio,
469                point_size_allele_fraction = point_size_allele_fraction,
470                gatk4_jar_override = gatk4_jar_override,
471                gatk_docker = gatk_docker,
472                mem_gb = mem_gb_for_plotting,
473                disk_space_gb = plot_normal_disk,
474                preemptible_attempts = preemptible_attempts
475        }
476    }
477
478    if (select_first([is_run_oncotator, false])) {
479        call CNVOncotator.CNVOncotatorWorkflow as CNVOncotatorWorkflow {
480            input:
481                 called_file = CallCopyRatioSegmentsTumor.called_copy_ratio_segments,
482                 additional_args = additional_args_for_oncotator,
483                 oncotator_docker = oncotator_docker,
484                 mem_gb_for_oncotator = mem_gb_for_oncotator,
485                 boot_disk_space_gb_for_oncotator = boot_disk_space_gb_for_oncotator,
486                 preemptible_attempts = preemptible_attempts
487        }
488    }
489    if (select_first([is_run_funcotator, false])) {
490        call CNVFuncotateSegments.CNVFuncotateSegmentsWorkflow as CNVFuncotateSegmentsWorkflow {
491            input:
492                 input_seg_file = CallCopyRatioSegmentsTumor.called_copy_ratio_segments,
493                 funcotator_ref_version = select_first([funcotator_ref_version, "hg19"]),
494                 extra_args = additional_args_for_funcotator,
495                 ref_fasta = ref_fasta,
496                 ref_fasta_fai = ref_fasta_fai,
497                 ref_fasta_dict = ref_fasta_dict,
498                 transcript_selection_list = funcotator_transcript_selection_list,
499                 funcotator_data_sources_tar_gz = funcotator_data_sources_tar_gz,
500                 gatk4_jar_override = gatk4_jar_override,
501                 gatk_docker = gatk_docker,
502                 mem_gb = mem_gb_for_funcotator,
503                 preemptible_attempts = preemptible_attempts,
504                 transcript_selection_mode = funcotator_transcript_selection_mode,
505                 annotation_defaults = funcotator_annotation_defaults,
506                 annotation_overrides = funcotator_annotation_overrides,
507                 funcotator_excluded_fields = funcotator_excluded_fields,
508                 is_removing_untared_datasources = funcotator_is_removing_untared_datasources,
509                 disk_space_gb = funcotator_disk_space_gb,
510                 use_ssd = funcotator_use_ssd,
511                 cpu = funcotator_cpu
512        }
513    }
514
515    output {
516        File preprocessed_intervals = PreprocessIntervals.preprocessed_intervals
517
518        File read_counts_entity_id_tumor = CollectCountsTumor.entity_id
519        File read_counts_tumor = CollectCountsTumor.counts
520        File allelic_counts_entity_id_tumor = CollectAllelicCountsTumor.entity_id
521        File allelic_counts_tumor = CollectAllelicCountsTumor.allelic_counts
522        File denoised_copy_ratios_tumor = DenoiseReadCountsTumor.denoised_copy_ratios
523        File standardized_copy_ratios_tumor = DenoiseReadCountsTumor.standardized_copy_ratios
524        File het_allelic_counts_tumor = ModelSegmentsTumor.het_allelic_counts
525        File normal_het_allelic_counts_tumor = ModelSegmentsTumor.normal_het_allelic_counts
526        File copy_ratio_only_segments_tumor = ModelSegmentsTumor.copy_ratio_only_segments
527        File copy_ratio_legacy_segments_tumor = ModelSegmentsTumor.copy_ratio_legacy_segments
528        File allele_fraction_legacy_segments_tumor = ModelSegmentsTumor.allele_fraction_legacy_segments
529        File modeled_segments_begin_tumor = ModelSegmentsTumor.modeled_segments_begin
530        File copy_ratio_parameters_begin_tumor = ModelSegmentsTumor.copy_ratio_parameters_begin
531        File allele_fraction_parameters_begin_tumor = ModelSegmentsTumor.allele_fraction_parameters_begin
532        File modeled_segments_tumor = ModelSegmentsTumor.modeled_segments
533        File copy_ratio_parameters_tumor = ModelSegmentsTumor.copy_ratio_parameters
534        File allele_fraction_parameters_tumor = ModelSegmentsTumor.allele_fraction_parameters
535        File called_copy_ratio_segments_tumor = CallCopyRatioSegmentsTumor.called_copy_ratio_segments
536        File called_copy_ratio_legacy_segments_tumor = CallCopyRatioSegmentsTumor.called_copy_ratio_legacy_segments
537        File denoised_copy_ratios_plot_tumor = PlotDenoisedCopyRatiosTumor.denoised_copy_ratios_plot
538        File standardized_MAD_tumor = PlotDenoisedCopyRatiosTumor.standardized_MAD
539        Float standardized_MAD_value_tumor = PlotDenoisedCopyRatiosTumor.standardized_MAD_value
540        File denoised_MAD_tumor = PlotDenoisedCopyRatiosTumor.denoised_MAD
541        Float denoised_MAD_value_tumor = PlotDenoisedCopyRatiosTumor.denoised_MAD_value
542        File delta_MAD_tumor = PlotDenoisedCopyRatiosTumor.delta_MAD
543        Float delta_MAD_value_tumor = PlotDenoisedCopyRatiosTumor.delta_MAD_value
544        File scaled_delta_MAD_tumor = PlotDenoisedCopyRatiosTumor.scaled_delta_MAD
545        Float scaled_delta_MAD_value_tumor = PlotDenoisedCopyRatiosTumor.scaled_delta_MAD_value
546        File modeled_segments_plot_tumor = PlotModeledSegmentsTumor.modeled_segments_plot
547
548        File? read_counts_entity_id_normal = CollectCountsNormal.entity_id
549        File? read_counts_normal = CollectCountsNormal.counts
550        File? allelic_counts_entity_id_normal = CollectAllelicCountsNormal.entity_id
551        File? allelic_counts_normal = CollectAllelicCountsNormal.allelic_counts
552        File? denoised_copy_ratios_normal = DenoiseReadCountsNormal.denoised_copy_ratios
553        File? standardized_copy_ratios_normal = DenoiseReadCountsNormal.standardized_copy_ratios
554        File? het_allelic_counts_normal = ModelSegmentsNormal.het_allelic_counts
555        File? normal_het_allelic_counts_normal = ModelSegmentsNormal.normal_het_allelic_counts
556        File? copy_ratio_only_segments_normal = ModelSegmentsNormal.copy_ratio_only_segments
557        File? copy_ratio_legacy_segments_normal = ModelSegmentsNormal.copy_ratio_legacy_segments
558        File? allele_fraction_legacy_segments_normal = ModelSegmentsNormal.allele_fraction_legacy_segments
559        File? modeled_segments_begin_normal = ModelSegmentsNormal.modeled_segments_begin
560        File? copy_ratio_parameters_begin_normal = ModelSegmentsNormal.copy_ratio_parameters_begin
561        File? allele_fraction_parameters_begin_normal = ModelSegmentsNormal.allele_fraction_parameters_begin
562        File? modeled_segments_normal = ModelSegmentsNormal.modeled_segments
563        File? copy_ratio_parameters_normal = ModelSegmentsNormal.copy_ratio_parameters
564        File? allele_fraction_parameters_normal = ModelSegmentsNormal.allele_fraction_parameters
565        File? called_copy_ratio_segments_normal = CallCopyRatioSegmentsNormal.called_copy_ratio_segments
566        File? called_copy_ratio_legacy_segments_normal = CallCopyRatioSegmentsNormal.called_copy_ratio_legacy_segments
567        File? denoised_copy_ratios_plot_normal = PlotDenoisedCopyRatiosNormal.denoised_copy_ratios_plot
568        File? standardized_MAD_normal = PlotDenoisedCopyRatiosNormal.standardized_MAD
569        Float? standardized_MAD_value_normal = PlotDenoisedCopyRatiosNormal.standardized_MAD_value
570        File? denoised_MAD_normal = PlotDenoisedCopyRatiosNormal.denoised_MAD
571        Float? denoised_MAD_value_normal = PlotDenoisedCopyRatiosNormal.denoised_MAD_value
572        File? delta_MAD_normal = PlotDenoisedCopyRatiosNormal.delta_MAD
573        Float? delta_MAD_value_normal = PlotDenoisedCopyRatiosNormal.delta_MAD_value
574        File? scaled_delta_MAD_normal = PlotDenoisedCopyRatiosNormal.scaled_delta_MAD
575        Float? scaled_delta_MAD_value_normal = PlotDenoisedCopyRatiosNormal.scaled_delta_MAD_value
576        File? modeled_segments_plot_normal = PlotModeledSegmentsNormal.modeled_segments_plot
577
578        File oncotated_called_file_tumor = select_first([CNVOncotatorWorkflow.oncotated_called_file, "null"])
579        File oncotated_called_gene_list_file_tumor = select_first([CNVOncotatorWorkflow.oncotated_called_gene_list_file, "null"])
580        File funcotated_called_file_tumor = select_first([CNVFuncotateSegmentsWorkflow.funcotated_seg_simple_tsv, "null"])
581        File funcotated_called_gene_list_file_tumor = select_first([CNVFuncotateSegmentsWorkflow.funcotated_gene_list_tsv, "null"])
582    }
583}
584
# Runs GATK DenoiseReadCounts on a read-counts file against a panel of normals and
# emits the standardized and denoised copy-ratio tables, both named after entity_id.
task DenoiseReadCounts {
    input {
      # Prefix used for the two output file names
      String entity_id
      # Read counts to denoise (passed as --input)
      File read_counts
      # Panel of normals (passed as --count-panel-of-normals)
      File read_count_pon
      Int? number_of_eigensamples #use all eigensamples in panel by default
      # Optional GATK jar to use instead of /root/gatk.jar inside the container
      File? gatk4_jar_override

      # Runtime parameters
      String gatk_docker
      Int? mem_gb
      Int? disk_space_gb
      Boolean use_ssd = false
      Int? cpu
      Int? preemptible_attempts
    }

    # Machine memory defaults to 13 GB (expressed as 13000 MB); the JVM heap is set 1000 MB below it
    Int machine_mem_mb = select_first([mem_gb, 13]) * 1000
    Int command_mem_mb = machine_mem_mb - 1000

    # disk_space_gb is optional, so it must not be concatenated into the disks string directly:
    # an unset optional cannot be used in a string concatenation outside of a placeholder.
    # If it is not specified, fall back to the size of the input files plus 20 GB of padding.
    Int default_disk_space_gb = ceil(size(read_counts, "GB")) + ceil(size(read_count_pon, "GB")) + 20
    Int disk_space_gb_ = select_first([disk_space_gb, default_disk_space_gb])
    String disk_type = if use_ssd then "SSD" else "HDD"

    command <<<
        set -e
        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override}

        gatk --java-options "-Xmx~{command_mem_mb}m" DenoiseReadCounts \
            --input ~{read_counts} \
            --count-panel-of-normals ~{read_count_pon} \
            ~{"--number-of-eigensamples " + number_of_eigensamples} \
            --standardized-copy-ratios ~{entity_id}.standardizedCR.tsv \
            --denoised-copy-ratios ~{entity_id}.denoisedCR.tsv
    >>>

    runtime {
        docker: "~{gatk_docker}"
        memory: machine_mem_mb + " MB"
        disks: "local-disk " + disk_space_gb_ + " " + disk_type
        cpu: select_first([cpu, 1])
        preemptible: select_first([preemptible_attempts, 5])
    }

    output {
        File standardized_copy_ratios = "~{entity_id}.standardizedCR.tsv"
        File denoised_copy_ratios = "~{entity_id}.denoisedCR.tsv"
    }
}
630
# Runs GATK ModelSegments on denoised copy ratios and allelic counts (optionally with
# matched-normal allelic counts) and exposes the segmentation, parameter, and het-site
# files written under the output directory with entity_id as the file-name prefix.
task ModelSegments {
    input {
      # Prefix passed as --output-prefix and used for all output file names
      String entity_id
      File denoised_copy_ratios
      File allelic_counts
      # When provided, passed as --normal-allelic-counts (matched-normal mode)
      File? normal_allelic_counts
      # The optional tuning parameters below fall back to the defaults written
      # in the command section when they are not specified
      Int? max_num_segments_per_chromosome
      Int? min_total_allele_count
      Int? min_total_allele_count_normal
      Float? genotyping_homozygous_log_ratio_threshold
      Float? genotyping_base_error_rate
      Float? kernel_variance_copy_ratio
      Float? kernel_variance_allele_fraction
      Float? kernel_scaling_allele_fraction
      Int? kernel_approximation_dimension
      # Each element is emitted as its own --window-size argument
      Array[Int]+? window_sizes = [8, 16, 32, 64, 128, 256]
      Float? num_changepoints_penalty_factor
      Float? minor_allele_fraction_prior_alpha
      Int? num_samples_copy_ratio
      Int? num_burn_in_copy_ratio
      Int? num_samples_allele_fraction
      Int? num_burn_in_allele_fraction
      Float? smoothing_threshold_copy_ratio
      Float? smoothing_threshold_allele_fraction
      Int? max_num_smoothing_iterations
      Int? num_smoothing_iterations_per_fit
      String? output_dir
      # Optional GATK jar to use instead of /root/gatk.jar inside the container
      File? gatk4_jar_override

      # Runtime parameters
      String gatk_docker
      Int? mem_gb
      Int? disk_space_gb
      Boolean use_ssd = false
      Int? cpu
      Int? preemptible_attempts
    }

    Int machine_mem_mb = select_first([mem_gb, 13]) * 1000
    # ModelSegments seems to need at least 3GB of overhead to run
    # NOTE(review): a mem_gb of 3 or less therefore yields a non-positive -Xmx value
    Int command_mem_mb = machine_mem_mb - 3000

    # If optional output_dir not specified, use "out"
    String output_dir_ = select_first([output_dir, "out"])

    # default values are min_total_allele_count_ = 0 in matched-normal mode
    #                                            = 30 in case-only mode
    Int default_min_total_allele_count = if defined(normal_allelic_counts) then 0 else 30
    Int min_total_allele_count_ = select_first([min_total_allele_count, default_min_total_allele_count])

    # disk_space_gb is optional, so it must not be concatenated into the disks string directly:
    # an unset optional cannot be used in a string concatenation outside of a placeholder.
    # If it is not specified, fall back to the size of the input files plus 20 GB of padding.
    Int normal_allelic_counts_size_gb = if defined(normal_allelic_counts) then ceil(size(normal_allelic_counts, "GB")) else 0
    Int default_disk_space_gb = ceil(size(denoised_copy_ratios, "GB")) + ceil(size(allelic_counts, "GB")) + normal_allelic_counts_size_gb + 20
    Int disk_space_gb_ = select_first([disk_space_gb, default_disk_space_gb])
    String disk_type = if use_ssd then "SSD" else "HDD"

    command <<<
        set -e
        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override}

        gatk --java-options "-Xmx~{command_mem_mb}m" ModelSegments \
            --denoised-copy-ratios ~{denoised_copy_ratios} \
            --allelic-counts ~{allelic_counts} \
            ~{"--normal-allelic-counts " + normal_allelic_counts} \
            --minimum-total-allele-count-case ~{min_total_allele_count_} \
            --minimum-total-allele-count-normal ~{default="30" min_total_allele_count_normal} \
            --genotyping-homozygous-log-ratio-threshold ~{default="-10.0" genotyping_homozygous_log_ratio_threshold} \
            --genotyping-base-error-rate ~{default="0.05" genotyping_base_error_rate} \
            --maximum-number-of-segments-per-chromosome ~{default="1000" max_num_segments_per_chromosome} \
            --kernel-variance-copy-ratio ~{default="0.0" kernel_variance_copy_ratio} \
            --kernel-variance-allele-fraction ~{default="0.025" kernel_variance_allele_fraction} \
            --kernel-scaling-allele-fraction ~{default="1.0" kernel_scaling_allele_fraction} \
            --kernel-approximation-dimension ~{default="100" kernel_approximation_dimension} \
            --window-size ~{sep=" --window-size " window_sizes} \
            --number-of-changepoints-penalty-factor ~{default="1.0" num_changepoints_penalty_factor} \
            --minor-allele-fraction-prior-alpha ~{default="25.0" minor_allele_fraction_prior_alpha} \
            --number-of-samples-copy-ratio ~{default="100" num_samples_copy_ratio} \
            --number-of-burn-in-samples-copy-ratio ~{default="50" num_burn_in_copy_ratio} \
            --number-of-samples-allele-fraction ~{default="100" num_samples_allele_fraction} \
            --number-of-burn-in-samples-allele-fraction ~{default="50" num_burn_in_allele_fraction} \
            --smoothing-credible-interval-threshold-copy-ratio ~{default="2.0" smoothing_threshold_copy_ratio} \
            --smoothing-credible-interval-threshold-allele-fraction ~{default="2.0" smoothing_threshold_allele_fraction} \
            --maximum-number-of-smoothing-iterations ~{default="10" max_num_smoothing_iterations} \
            --number-of-smoothing-iterations-per-fit ~{default="0" num_smoothing_iterations_per_fit} \
            --output ~{output_dir_} \
            --output-prefix ~{entity_id}

        # We need to create the file even if the above command doesn't so we have something to delocalize
        # If no file is created by the above task then it will copy out an empty file
        touch ~{output_dir_}/~{entity_id}.hets.normal.tsv
    >>>

    runtime {
        docker: "~{gatk_docker}"
        memory: machine_mem_mb + " MB"
        disks: "local-disk " + disk_space_gb_ + " " + disk_type
        cpu: select_first([cpu, 1])
        preemptible: select_first([preemptible_attempts, 5])
    }

    output {
        File het_allelic_counts = "~{output_dir_}/~{entity_id}.hets.tsv"
        # May be an empty placeholder created by the touch at the end of the command
        File normal_het_allelic_counts = "~{output_dir_}/~{entity_id}.hets.normal.tsv"
        File copy_ratio_only_segments = "~{output_dir_}/~{entity_id}.cr.seg"
        File copy_ratio_legacy_segments = "~{output_dir_}/~{entity_id}.cr.igv.seg"
        File allele_fraction_legacy_segments = "~{output_dir_}/~{entity_id}.af.igv.seg"
        File modeled_segments_begin = "~{output_dir_}/~{entity_id}.modelBegin.seg"
        File copy_ratio_parameters_begin = "~{output_dir_}/~{entity_id}.modelBegin.cr.param"
        File allele_fraction_parameters_begin = "~{output_dir_}/~{entity_id}.modelBegin.af.param"
        File modeled_segments = "~{output_dir_}/~{entity_id}.modelFinal.seg"
        File copy_ratio_parameters = "~{output_dir_}/~{entity_id}.modelFinal.cr.param"
        File allele_fraction_parameters = "~{output_dir_}/~{entity_id}.modelFinal.af.param"
    }
}
739
# Runs GATK CallCopyRatioSegments on a copy-ratio segments file and exposes the called
# segments file together with its .igv.seg counterpart, both named after entity_id.
task CallCopyRatioSegments {
    input {
      # Prefix used for the output file names
      String entity_id
      # Segments file to call (passed as --input)
      File copy_ratio_segments
      # The optional thresholds below fall back to the defaults written
      # in the command section when they are not specified
      Float? neutral_segment_copy_ratio_lower_bound
      Float? neutral_segment_copy_ratio_upper_bound
      Float? outlier_neutral_segment_copy_ratio_z_score_threshold
      Float? calling_copy_ratio_z_score_threshold
      # Optional GATK jar to use instead of /root/gatk.jar inside the container
      File? gatk4_jar_override

      # Runtime parameters
      String gatk_docker
      Int? mem_gb
      Int? disk_space_gb
      Boolean use_ssd = false
      Int? cpu
      Int? preemptible_attempts
    }

    # Machine memory defaults to 7 GB (expressed as 7000 MB); the JVM heap is set 1000 MB below it
    Int machine_mem_mb = select_first([mem_gb, 7]) * 1000
    Int command_mem_mb = machine_mem_mb - 1000

    # disk_space_gb is optional, so it must not be concatenated into the disks string directly:
    # an unset optional cannot be used in a string concatenation outside of a placeholder.
    # If it is not specified, fall back to the size of the input file plus 20 GB of padding.
    Int default_disk_space_gb = ceil(size(copy_ratio_segments, "GB")) + 20
    Int disk_space_gb_ = select_first([disk_space_gb, default_disk_space_gb])
    String disk_type = if use_ssd then "SSD" else "HDD"

    command <<<
        set -e
        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override}

        gatk --java-options "-Xmx~{command_mem_mb}m" CallCopyRatioSegments \
            --input ~{copy_ratio_segments} \
            --neutral-segment-copy-ratio-lower-bound ~{default="0.9" neutral_segment_copy_ratio_lower_bound} \
            --neutral-segment-copy-ratio-upper-bound ~{default="1.1" neutral_segment_copy_ratio_upper_bound} \
            --outlier-neutral-segment-copy-ratio-z-score-threshold ~{default="2.0" outlier_neutral_segment_copy_ratio_z_score_threshold} \
            --calling-copy-ratio-z-score-threshold ~{default="2.0" calling_copy_ratio_z_score_threshold} \
            --output ~{entity_id}.called.seg
    >>>

    runtime {
        docker: "~{gatk_docker}"
        memory: machine_mem_mb + " MB"
        disks: "local-disk " + disk_space_gb_ + " " + disk_type
        cpu: select_first([cpu, 1])
        preemptible: select_first([preemptible_attempts, 5])
    }

    output {
        File called_copy_ratio_segments = "~{entity_id}.called.seg"
        # Not named on the command line; presumably written by the tool next to --output (TODO confirm)
        File called_copy_ratio_legacy_segments = "~{entity_id}.called.igv.seg"
    }
}
788
# Runs GATK PlotDenoisedCopyRatios on the standardized and denoised copy ratios and
# exposes the resulting plot plus the four MAD text files, each also parsed as a Float.
task PlotDenoisedCopyRatios {
    input {
      # Prefix passed as --output-prefix and used for all output file names
      String entity_id
      File standardized_copy_ratios
      File denoised_copy_ratios
      # Passed as --sequence-dictionary
      File ref_fasta_dict
      # The optional plotting parameters below fall back to the defaults written
      # in the command section when they are not specified
      Int? minimum_contig_length
      String? maximum_copy_ratio
      Float? point_size_copy_ratio
      String? output_dir
      # Optional GATK jar to use instead of /root/gatk.jar inside the container
      File? gatk4_jar_override

      # Runtime parameters
      String gatk_docker
      Int? mem_gb
      Int? disk_space_gb
      Boolean use_ssd = false
      Int? cpu
      Int? preemptible_attempts
    }

    # Machine memory defaults to 7 GB (expressed as 7000 MB); the JVM heap is set 1000 MB below it
    Int machine_mem_mb = select_first([mem_gb, 7]) * 1000
    Int command_mem_mb = machine_mem_mb - 1000

    # If optional output_dir not specified, use "out"
    String output_dir_ = select_first([output_dir, "out"])

    # disk_space_gb is optional, so it must not be concatenated into the disks string directly:
    # an unset optional cannot be used in a string concatenation outside of a placeholder.
    # If it is not specified, fall back to the size of the input files plus 20 GB of padding.
    Int default_disk_space_gb = ceil(size(standardized_copy_ratios, "GB")) + ceil(size(denoised_copy_ratios, "GB")) + ceil(size(ref_fasta_dict, "GB")) + 20
    Int disk_space_gb_ = select_first([disk_space_gb, default_disk_space_gb])
    String disk_type = if use_ssd then "SSD" else "HDD"

    command <<<
        set -e
        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override}

        gatk --java-options "-Xmx~{command_mem_mb}m" PlotDenoisedCopyRatios \
            --standardized-copy-ratios ~{standardized_copy_ratios} \
            --denoised-copy-ratios ~{denoised_copy_ratios} \
            --sequence-dictionary ~{ref_fasta_dict} \
            --minimum-contig-length ~{default="1000000" minimum_contig_length} \
            --maximum-copy-ratio ~{default="4.0" maximum_copy_ratio} \
            --point-size-copy-ratio ~{default="0.2" point_size_copy_ratio} \
            --output ~{output_dir_} \
            --output-prefix ~{entity_id}
    >>>

    runtime {
        docker: "~{gatk_docker}"
        memory: machine_mem_mb + " MB"
        disks: "local-disk " + disk_space_gb_ + " " + disk_type
        cpu: select_first([cpu, 1])
        preemptible: select_first([preemptible_attempts, 5])
    }

    output {
        File denoised_copy_ratios_plot = "~{output_dir_}/~{entity_id}.denoised.png"
        # Each MAD text file is exposed as a File and also read back as a single Float
        File standardized_MAD = "~{output_dir_}/~{entity_id}.standardizedMAD.txt"
        Float standardized_MAD_value = read_float(standardized_MAD)
        File denoised_MAD = "~{output_dir_}/~{entity_id}.denoisedMAD.txt"
        Float denoised_MAD_value = read_float(denoised_MAD)
        File delta_MAD = "~{output_dir_}/~{entity_id}.deltaMAD.txt"
        Float delta_MAD_value = read_float(delta_MAD)
        File scaled_delta_MAD = "~{output_dir_}/~{entity_id}.scaledDeltaMAD.txt"
        Float scaled_delta_MAD_value = read_float(scaled_delta_MAD)
    }
}
851
# Runs GATK PlotModeledSegments on denoised copy ratios, het allelic counts, and modeled
# segments, and exposes the resulting plot written under the output directory.
task PlotModeledSegments {
    input {
      # Prefix passed as --output-prefix and used for the output file name
      String entity_id
      File denoised_copy_ratios
      # Passed as --allelic-counts
      File het_allelic_counts
      # Passed as --segments
      File modeled_segments
      # Passed as --sequence-dictionary
      File ref_fasta_dict
      # The optional plotting parameters below fall back to the defaults written
      # in the command section when they are not specified
      Int? minimum_contig_length
      String? maximum_copy_ratio
      Float? point_size_copy_ratio
      Float? point_size_allele_fraction
      String? output_dir
      # Optional GATK jar to use instead of /root/gatk.jar inside the container
      File? gatk4_jar_override

      # Runtime parameters
      String gatk_docker
      Int? mem_gb
      Int? disk_space_gb
      Boolean use_ssd = false
      Int? cpu
      Int? preemptible_attempts
    }

    # Machine memory defaults to 7 GB (expressed as 7000 MB); the JVM heap is set 1000 MB below it
    Int machine_mem_mb = select_first([mem_gb, 7]) * 1000
    Int command_mem_mb = machine_mem_mb - 1000

    # If optional output_dir not specified, use "out"
    String output_dir_ = select_first([output_dir, "out"])

    # disk_space_gb is optional, so it must not be concatenated into the disks string directly:
    # an unset optional cannot be used in a string concatenation outside of a placeholder.
    # If it is not specified, fall back to the size of the input files plus 20 GB of padding.
    Int default_disk_space_gb = ceil(size(denoised_copy_ratios, "GB")) + ceil(size(het_allelic_counts, "GB")) + ceil(size(modeled_segments, "GB")) + ceil(size(ref_fasta_dict, "GB")) + 20
    Int disk_space_gb_ = select_first([disk_space_gb, default_disk_space_gb])
    String disk_type = if use_ssd then "SSD" else "HDD"

    command <<<
        set -e
        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override}

        gatk --java-options "-Xmx~{command_mem_mb}m" PlotModeledSegments \
            --denoised-copy-ratios ~{denoised_copy_ratios} \
            --allelic-counts ~{het_allelic_counts} \
            --segments ~{modeled_segments} \
            --sequence-dictionary ~{ref_fasta_dict} \
            --minimum-contig-length ~{default="1000000" minimum_contig_length} \
            --maximum-copy-ratio ~{default="4.0" maximum_copy_ratio} \
            --point-size-copy-ratio ~{default="0.2" point_size_copy_ratio} \
            --point-size-allele-fraction ~{default="0.4" point_size_allele_fraction} \
            --output ~{output_dir_} \
            --output-prefix ~{entity_id}
    >>>

    runtime {
        docker: "~{gatk_docker}"
        memory: machine_mem_mb + " MB"
        disks: "local-disk " + disk_space_gb_ + " " + disk_type
        cpu: select_first([cpu, 1])
        preemptible: select_first([preemptible_attempts, 5])
    }

    output {
        File modeled_segments_plot = "~{output_dir_}/~{entity_id}.modeled.png"
    }
}
910