diff --git a/README.md b/README.md index 86f39c8..0167e9c 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,11 @@ nextflow run h3abionet/chipimputation/main.nf -profile test,singularity Check for results in `./output` +In case you are running the outdated version, run the code below prior to executing the pipeline. + +``` +nextflow pull h3abionet/chipimputation +``` ### Start running your own analysis diff --git a/main.nf b/main.nf index 145aa42..4506ee3 100644 --- a/main.nf +++ b/main.nf @@ -54,6 +54,8 @@ workflow preprocess { take: datasets main: + + //// check if study genotype files exist target_datasets = [] datasets.each { name, vcf -> @@ -159,20 +161,23 @@ workflow report_by_ref{ take: data main: - /// /// By Refecene panel - // combine by chrom, dataset, refpanel + // /// /// By Refecene panel + // // combine by chrom, dataset, refpanel imputeCombine_ref = data - .groupTuple( by:[1] ) - .map{ datasets, refpanel, vcfs, imputed_vcfs, imputed_infos -> [ refpanel, datasets.join(','), '', imputed_infos.join(',') ] } - filter_info_by_target( imputeCombine_ref ) - - /// change to group_by_maf + .groupTuple( by:[1,0] ) + // .map{ datasets, refpanel, vcfs, imputed_vcfs, imputed_infos -> [ refpanel, datasets.join(','), '', imputed_infos.join(',') ] } + .map{ datasets, refpanel, vcfs, imputed_vcfs, indexed_vcf, imputed_infos -> [ refpanel, datasets, '', imputed_infos ] } + // imputeCombine_ref.view() + combineInfo(imputeCombine_ref) + filter_info_by_target( combineInfo.out ) + + // /// change to group_by_maf report_well_imputed_by_target( filter_info_by_target.out.map{ target_name, ref_panels, wellInfo, accInfo -> [ target_name, ref_panels, file(wellInfo) ]} ) - //// Plot performance all targets by maf for a reference panel + // //// Plot performance all targets by maf for a reference panel plot_performance_target( report_well_imputed_by_target.out.map{ target_name, ref_panels, wellInfo, wellInfo_summary -> [ target_name, ref_panels, file(wellInfo), file(wellInfo_summary), 'DATASETS' ]} ) - //// Accuracy/Concordance + // //// Accuracy/Concordance report_accuracy_target( filter_info_by_target.out.map{ target_name, ref_panels, wellInfo, accInfo -> [ target_name, ref_panels, file(accInfo), 'DATASETS' ]} ) plot_accuracy_target ( report_accuracy_target.out ) emit: @@ -183,29 +188,33 @@ workflow report_by_dataset{ take: data main: - /// /// By Dataset - // combine by chrom, dataset, refpanel + // /// /// By Dataset + // // combine by chrom, dataset, refpanel imputeCombine_ref = data - .groupTuple( by:[0] ) - .map{ dataset, refpanels, vcfs, imputed_vcfs, imputed_infos -> [ dataset, refpanels.join(','), '', imputed_infos.join(',') ] } - filter_info_by_target( imputeCombine_ref ) + .groupTuple( by:[0,1] ) + // // .map{ dataset, refpanels, vcfs, imputed_vcfs, imputed_infos -> [ dataset, refpanels[0], '', imputed_infos.join(',') ] } + .map{ dataset, refpanels, vcfs, imputed_vcfs, indexed_vcfs, imputed_infos -> [ dataset, refpanels, '', imputed_infos ] } + // imputeCombine_ref.view() + combineInfo(imputeCombine_ref) + filter_info_by_target( combineInfo.out ) + - ///// Number of well imputed snps - /// change to group_by_maf + // ///// Number of well imputed snps + // // / change to group_by_maf report_well_imputed_by_target( filter_info_by_target.out.map{ target_name, ref_panels, wellInfo, accInfo -> [ target_name, ref_panels, file(wellInfo) ]} ) - /// Plot performance all targets by maf for a reference panel + // // / Plot performance all targets by maf for a reference panel plot_performance_target( report_well_imputed_by_target.out.map{ target_name, ref_panels, wellInfo, wellInfo_summary -> [ target_name, ref_panels, file(wellInfo), file(wellInfo_summary), 'REFERENCE_PANELS' ]} ) - //// Accuracy/Concordance + // //// Accuracy/Concordance report_accuracy_target( filter_info_by_target.out.map{ target_name, ref_panels, wellInfo, accInfo -> [ target_name, ref_panels, file(accInfo), 'REFERENCE_PANELS' ]} ) plot_accuracy_target ( report_accuracy_target.out ) - // Plot number of imputed SNPs over the mean r2 for all reference panels - input = imputeCombine_ref + // // Plot number of imputed SNPs over the mean r2 for all reference panels + input = combineInfo.out .map{ dataset, refpanels, chrm, infos -> [dataset, refpanels, infos]} - // Plot number of imputed SNPs over the mean r2 for all reference panels + // // Plot number of imputed SNPs over the mean r2 for all reference panels plot_r2_SNPcount(input) // Plot histograms of number of imputed SNPs over the mean r2 for all reference panels @@ -251,11 +260,12 @@ workflow { // Reporting // impute.out.chunks_imputed.view() impute_data = impute.out.chunks_imputed - .map{chr, fwd, rev, test_data, ref, imputed_vcf, - imputed_info, tst_data -> [test_data, ref, imputed_vcf, imputed_info]} + .map{chr, fwd, rev, test_data, ref, imputed_vcf, indexed_vcf, + imputed_info, tst_data -> [test_data, ref, imputed_vcf, indexed_vcf, imputed_info]} .combine(params.target_datasets, by:0) - .map {test_data, ref, imputed_vcf, imputed_info, orig_vcf - -> [test_data, ref, orig_vcf, imputed_vcf, imputed_info]} + .map {test_data, ref, imputed_vcf, indexed_vcf, imputed_info, orig_vcf + -> [test_data, ref, orig_vcf, imputed_vcf, indexed_vcf, imputed_info]} + // //// Report by Reference report_by_ref( impute_data ) @@ -263,28 +273,57 @@ workflow { // //// Report by datasets report_by_dataset( impute_data ) - // // Generate dataset frequencies - inp = Channel.fromList(params.ref_panels).map{ref, m3vcf, vcf -> [ref, vcf]} + // Combine vcfs + input = impute_data - .map{ target_name, ref_name, vcf, impute_vcf, info ->[ ref_name, target_name, file(impute_vcf)]} + .groupTuple( by:[0,1] ) + .map{ target_name, ref_name, vcf, impute_vcf, indexed_vcf, info ->[ target_name, ref_name, '', impute_vcf, indexed_vcf]} + // input.view() + combineImpute(input) + + // Combine Infos + imputeCombine_ref = impute_data + .groupTuple( by:[0,1] ) + .map{ dataset, refpanels, vcfs, imputed_vcfs, indexed_vcfs, imputed_infos -> [ dataset, refpanels, '', imputed_infos ] } + // imputeCombine_ref.view() + combineInfo(imputeCombine_ref) + + + // // Generate dataset frequencies + inp = Channel.fromList(params.combined_ref) + imp_input = combineImpute.out + .map{target, ref, chrm, vcf -> [ref, target, vcf]} .combine(inp, by:0) .map{ ref_name, target_name, impute_vcf, ref_vcf -> [target_name, ref_name, file(impute_vcf), file(ref_vcf)]} - generate_frequency(input) + generate_frequency(imp_input) // // Plot frequency Comparison - freq_comp = impute_data.map {target_name, ref_name, vcf, impute_vcf, info -> - [target_name, ref_name, info]} - .combine(generate_frequency.out, by:[0,1]) + freq_comp = combineInfo.out + .map{target, ref, chrm, info -> [target, ref, info]} + .combine(generate_frequency.out, by:[0,1]) plot_freq_comparison(freq_comp) + // // Plot number of imputed SNPs over the mean r2 for all reference panels - combineInfo_frq = impute_data.map{ target_name, ref_name, vcf, impute_vcf, info ->[ target_name, ref_name, info, params.maf_thresh]} - .combine(generate_frequency.out, by:[0,1]) - .map { target_name, ref_name, info, maf_thresh, target_frq, ref_frq -> - [target_name, ref_name, info, maf_thresh, target_frq]} + // combineInfo_frq = impute_data.map{ target_name, ref_name, vcf, impute_vcf, info ->[ target_name, ref_name, info, params.maf_thresh]} + // .combine(generate_frequency.out, by:[0,1]) + // .map { target_name, ref_name, info, maf_thresh, target_frq, ref_frq -> + // [target_name, ref_name, info, maf_thresh, target_frq]} + // plot_r2_SNPpos(combineInfo_frq) + + combineInfo_frq = combineInfo.out + .map{target, ref, chrm, info -> [target, ref, info, params.maf_thresh]} + .combine(generate_frequency.out, by:[0,1]) + .map { target_name, ref_name, info, maf_thresh, target_frq, ref_frq -> + [target_name, ref_name, info, maf_thresh, target_frq]} + // combineInfo_frq.view() plot_r2_SNPpos(combineInfo_frq) // // compute for average rsquared values - rsquared_input = impute_data.map{ target_name, ref_name, vcf, impute_vcf, info ->[ target_name, ref_name, info]} + // rsquared_input = impute_data.map{ target_name, ref_name, vcf, impute_vcf, info ->[ target_name, ref_name, info]} + // average_r2(rsquared_input) + rsquared_input = combineInfo.out + .map{target, ref, chrm, info -> [target, ref, info]} average_r2(rsquared_input) + } \ No newline at end of file diff --git a/modules/impute.nf b/modules/impute.nf index d5f6ffe..7f7eedb 100644 --- a/modules/impute.nf +++ b/modules/impute.nf @@ -31,9 +31,11 @@ process impute_minimac4 { input: tuple val(chrm), val(chunk_start), val(chunk_end), val(target_name), file(target_phased_vcf), file(target_phased_vcf_tbi), val(ref_name), file(ref_vcf), file(ref_m3vcf), val(tagName) output: - tuple val(chrm), val(chunk_start), val(chunk_end), val(target_name), val(ref_name), file("${base}_imputed.dose.vcf.gz"), file("${base}_imputed.info"), val(tagName) + tuple val(chrm), val(chunk_start), val(chunk_end), val(target_name), val(ref_name), file(imputed_vcf), file(indexed_vcf), file("${base}_imputed.info"), val(tagName) shell: base = "${file(target_phased_vcf.baseName).baseName}_${tagName}_${chrm}_${chunk_start}-${chunk_end}" + imputed_vcf = "${base}_imputed.dose.vcf.gz" + indexed_vcf = "${base}_imputed.dose.vcf.gz.tbi" """ minimac4 \ --refHaps ${ref_m3vcf} \ @@ -43,7 +45,8 @@ process impute_minimac4 { --minRatio ${params.minRatio} \ --chr ${chrm} --start ${chunk_start} --end ${chunk_end} --window ${params.buffer_size} \ --prefix ${base}_imputed \ - --cpus ${task.cpus} + --cpus ${task.cpus} + tabix ${imputed_vcf} """ } @@ -119,6 +122,7 @@ process impute_minimac4_1 { """ Combine impute chunks to chromosomes +bcftools +fill-tags -- -t AC,AN,AF,MAF | \ """ process combineImpute { tag "impComb_${target_name}_${ref_name}_${chrm}" @@ -126,14 +130,13 @@ process combineImpute { label "bigmem" input: - tuple val(target_name), val(ref_name), val(chrm), val(imputed_files) + tuple val(target_name), val(ref_name), val(chrm), file(imputed_files), file(indexed_files) output: tuple val(target_name), val(ref_name), val(chrm), file(comb_impute)// into combineImpute script: comb_impute = "${target_name}_${ref_name}_chr${chrm}.imputed.vcf.gz" """ - bcftools concat ${imputed_files.join(' ')} | \ - bcftools +fill-tags -- -t AC,AN,AF,MAF | \ + bcftools concat -a ${imputed_files.join(' ')} | \ bcftools sort -T . -Oz -o ${comb_impute} """ } diff --git a/modules/qc.nf b/modules/qc.nf index 8ad9f5e..867d372 100644 --- a/modules/qc.nf +++ b/modules/qc.nf @@ -37,6 +37,7 @@ def check_chromosome_vcf(dataset, dataset_vcf, map_file, chrms){ chromosomes_ = [:] chromosomes_['ALL'] = [] valid_chrms = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22'] + // valid_chrms = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22] not_chrs = [] in_chrs = [] notValid_chrs = [] diff --git a/trypanogen.config b/trypanogen.config new file mode 100644 index 0000000..c095a1a --- /dev/null +++ b/trypanogen.config @@ -0,0 +1,115 @@ +params { + project_name = 'hlaimputation' + project_description = 'A simple imputation run on chrm 6 distributed with git repo' + outDir = "output/${project_name}" // outDir: where to put all pipeline's outputs + help = false + max_memory = 50.GB + max_cpus = 5 + max_time = 96.h + + // Reference panels + // Per chromosomes [name, m3vcf, vcf_or_bcf] + // Replace chromosome with %s for string interpolation to use multiple chromosomes + ref_panels = [ + //['Trypanogen',"/scratch3/users/nanje/chipimputation_evaluate_chips/results/trypanogen/m3vcfs/Eagle_%s.m3vcf.gz", + //"/scratch3/users/nanje/chipimputation_evaluate_chips/John/Eagle_%s.vcf.gz"], + ['Ugandan', "/scratch3/users/nanje/chipimputation_evaluate_chips/results/Ugandan/m3vcfs/Ugandan_%s_phased.m3vcf.gz", + "/scratch3/users/nanje/chipimputation_evaluate_chips/results/Ugandan/vcfs/Ugandan_%s_phased.vcf.gz"], + ] + combined_ref = [ + ['Ugandan', "/scratch3/users/nanje/chipimputation_evaluate_chips/results/Ugandan/vcfs/Ugandan_merged_sorted.vcf.gz"], + ] + + combined_imputed = [ + ["Ugandan", "John_target", "/scratch3/users/nanje/chipimputation/chipimputation/work/28/1a22e891a90d3c18473e7d56315e1b/John_target_Ugandan_chr.imputed.vcf.gz"], + ] + + // Study datasets + target_datasets = [ + // ["cleandatanops", "/scratch3/users/nanje/chipimputation_evaluate_chips/John/target/cleandatanops2-updated3.vcf.gz"], + ["John_target", "/scratch3/users/nanje/John/sample_1_22.vcf.gz"], + ] + + // Genetic map for eagle2 + eagle_genetic_map = "/cbio/dbs/refpanels/eagle/tables/genetic_map_hg19_withX.txt.gz" + + // Reference genome used for QC + reference_genome = "/cbio/dbs/gatk/2.8/b37/human_g1k_v37_decoy.fasta" + + // List chromosomes to be used. chr22 for b38. Use ALL or '' to use all available chromosome in the target dataset + chromosomes = '1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22' + + // imputation parameters + NE = "20000" + impute_iter = "10" + impute_burnin = "2" // must be less than impute_burnin + impute_info_cutoff = "0.3" + chunk_size = "20000000" // in base + buffer_size = "1000000" // in base + + // QC parameters + site_miss = "0.05" + hwe = "0.00001" + mac = "1" + min_ac = '2' + min_alleles = '2' + max_alleles = '2' + + // Phasing method: shapeit (default) or eagle2 + phasing_method = "eagle" + + // Imputation method minimac4 (default) or IMPUTE2 + impute_method = "minimac4" + + // Minimac4 option + minRatio = '0.1' + + // Phasing method: eagle2 (default) or shapeit + phasing_method = "eagle" + + // Plink to use, sometimes it can be plink2 + plink="plink2" + +} + +timeline { + enabled = true + file = "${params.outDir}/nextflow_reports/${params.project_name}_h3achipimputation_timeline.html" +} +report { + enabled = true + file = "${params.outDir}/nextflow_reports/${params.project_name}_h3achipimputation_report.html" +} +trace { + enabled = true + file = "${params.outDir}/nextflow_reports/${params.project_name}_h3achipimputation_trace.txt" +} +dag { + enabled = true + file = "${params.outDir}/nextflow_reports/${params.project_name}_h3achipimputation_dag.png" +} + +process { +// Process-specific resource requirements + withLabel: 'medium' { + errorStrategy = { task.exitStatus in [143, 137, 255] ? 'retry' : 'ignore' } + memory = 10.GB + } + withLabel : 'bigmem' { + errorStrategy = { task.exitStatus in [143, 137, 255] ? 'retry' : 'ignore' } + memory = 70.GB + time = { 24.h * task.attempt } + } +} + +profiles{ + singularity { + singularity.runOptions = " -B ${HOME} " + } + slurm { + queueSize = 100 + } + test { + queueSize = 10 + } +}