h3abionet · nanjalaruth · Jan 4, 2022 · Jun 16, 2022 · Jun 16, 2022 · Jun 16, 2022
diff --git a/README.md b/README.md
@@ -34,6 +34,11 @@ nextflow run h3abionet/chipimputation/main.nf -profile test,singularity
 
 Check for results in `./output`
 
+In case you are running the outdated version, run the code below prior to executing the pipeline.
+
+```
+nextflow pull h3abionet/chipimputation
+```
 
 ### Start running your own analysis
 

diff --git a/main.nf b/main.nf
@@ -54,6 +54,8 @@ workflow preprocess {
     take: datasets
 
     main:
+
+
         //// check if study genotype files exist
         target_datasets = []
         datasets.each { name, vcf ->
@@ -159,20 +161,23 @@ workflow report_by_ref{
     take: data
 
     main:
-        /// /// By Refecene panel
-        // combine by chrom, dataset, refpanel
+        // /// /// By Refecene panel
+        // // combine by chrom, dataset, refpanel
         imputeCombine_ref = data
-                .groupTuple( by:[1] )
-                .map{ datasets, refpanel, vcfs, imputed_vcfs, imputed_infos -> [ refpanel, datasets.join(','), '', imputed_infos.join(',') ] }
-        filter_info_by_target( imputeCombine_ref )
-
-        /// change to group_by_maf
+                .groupTuple( by:[1,0] )
+                // .map{ datasets, refpanel, vcfs, imputed_vcfs, imputed_infos -> [ refpanel, datasets.join(','), '', imputed_infos.join(',') ] }
+                .map{ datasets, refpanel, vcfs, imputed_vcfs, indexed_vcf, imputed_infos -> [ refpanel, datasets, '', imputed_infos ] }
+        // imputeCombine_ref.view()
+        combineInfo(imputeCombine_ref)
+        filter_info_by_target( combineInfo.out )
+
+        // /// change to group_by_maf
         report_well_imputed_by_target( filter_info_by_target.out.map{ target_name, ref_panels, wellInfo, accInfo -> [ target_name, ref_panels, file(wellInfo) ]} )
 
-        //// Plot performance all targets by maf for a reference panel
+        // //// Plot performance all targets by maf for a reference panel
         plot_performance_target( report_well_imputed_by_target.out.map{ target_name, ref_panels, wellInfo, wellInfo_summary -> [ target_name, ref_panels, file(wellInfo), file(wellInfo_summary), 'DATASETS' ]} )
 
-        //// Accuracy/Concordance
+        // //// Accuracy/Concordance
         report_accuracy_target( filter_info_by_target.out.map{ target_name, ref_panels, wellInfo, accInfo -> [ target_name, ref_panels, file(accInfo), 'DATASETS' ]} )
         plot_accuracy_target ( report_accuracy_target.out )
     emit:
@@ -183,29 +188,33 @@ workflow report_by_dataset{
     take: data
 
     main:
-        /// /// By Dataset
-        // combine by chrom, dataset, refpanel
+    //     /// /// By Dataset
+    //     // combine by chrom, dataset, refpanel
 
         imputeCombine_ref = data
-                .groupTuple( by:[0] )
-                .map{ dataset, refpanels, vcfs, imputed_vcfs, imputed_infos -> [ dataset, refpanels.join(','), '', imputed_infos.join(',') ] }
-        filter_info_by_target( imputeCombine_ref )
+                .groupTuple( by:[0,1] )
+        //         // .map{ dataset, refpanels, vcfs, imputed_vcfs, imputed_infos -> [ dataset, refpanels[0], '', imputed_infos.join(',') ] }
+                .map{ dataset, refpanels, vcfs, imputed_vcfs, indexed_vcfs, imputed_infos -> [ dataset, refpanels, '', imputed_infos ] }
+        // imputeCombine_ref.view()
+        combineInfo(imputeCombine_ref)
+        filter_info_by_target( combineInfo.out )
+
 
-        ///// Number of well imputed snps
-        /// change to group_by_maf
+    //     ///// Number of well imputed snps
+    //     // / change to group_by_maf
         report_well_imputed_by_target( filter_info_by_target.out.map{ target_name, ref_panels, wellInfo, accInfo -> [ target_name, ref_panels, file(wellInfo) ]} )
-        /// Plot performance all targets by maf for a reference panel
+    //     // / Plot performance all targets by maf for a reference panel
         plot_performance_target( report_well_imputed_by_target.out.map{ target_name, ref_panels, wellInfo, wellInfo_summary -> [ target_name, ref_panels, file(wellInfo), file(wellInfo_summary), 'REFERENCE_PANELS' ]} )
 
-        //// Accuracy/Concordance
+    //     //// Accuracy/Concordance
         report_accuracy_target( filter_info_by_target.out.map{ target_name, ref_panels, wellInfo, accInfo -> [ target_name, ref_panels, file(accInfo), 'REFERENCE_PANELS' ]} )
         plot_accuracy_target ( report_accuracy_target.out )
 
-        // Plot number of imputed SNPs over the mean r2 for all reference panels
-        input = imputeCombine_ref
+    //     // Plot number of imputed SNPs over the mean r2 for all reference panels
+        input = combineInfo.out 
         .map{ dataset, refpanels, chrm, infos -> [dataset, refpanels, infos]}
 
-        // Plot number of imputed SNPs over the mean r2 for all reference panels
+    //     // Plot number of imputed SNPs over the mean r2 for all reference panels
         plot_r2_SNPcount(input)
 
         // Plot histograms of number of imputed SNPs over the mean r2 for all reference panels
@@ -251,40 +260,70 @@ workflow {
     // Reporting
     // impute.out.chunks_imputed.view()
     impute_data = impute.out.chunks_imputed
-                .map{chr, fwd, rev, test_data, ref, imputed_vcf, 
-                imputed_info, tst_data -> [test_data, ref, imputed_vcf, imputed_info]}
+                .map{chr, fwd, rev, test_data, ref, imputed_vcf, indexed_vcf,
+                imputed_info, tst_data -> [test_data, ref, imputed_vcf, indexed_vcf, imputed_info]}
                 .combine(params.target_datasets, by:0)
-                .map {test_data, ref, imputed_vcf, imputed_info, orig_vcf 
-                -> [test_data, ref, orig_vcf, imputed_vcf, imputed_info]}
+                .map {test_data, ref, imputed_vcf, indexed_vcf, imputed_info, orig_vcf 
+                -> [test_data, ref, orig_vcf, imputed_vcf, indexed_vcf, imputed_info]}
+
 
     // //// Report by Reference
     report_by_ref( impute_data )
 
     // //// Report by datasets
     report_by_dataset( impute_data )
 
-    // // Generate dataset frequencies
-    inp = Channel.fromList(params.ref_panels).map{ref, m3vcf, vcf -> [ref, vcf]}
+    // Combine vcfs
+
     input = impute_data
-    .map{ target_name, ref_name, vcf, impute_vcf, info ->[  ref_name, target_name, file(impute_vcf)]}
+        .groupTuple( by:[0,1] )
+        .map{ target_name, ref_name, vcf, impute_vcf, indexed_vcf, info ->[ target_name, ref_name, '', impute_vcf, indexed_vcf]}
+    // input.view()
+    combineImpute(input)
+
+    // Combine Infos
+    imputeCombine_ref = impute_data
+            .groupTuple( by:[0,1] )
+            .map{ dataset, refpanels, vcfs, imputed_vcfs, indexed_vcfs, imputed_infos -> [ dataset, refpanels, '', imputed_infos ] }
+        // imputeCombine_ref.view()
+    combineInfo(imputeCombine_ref)
+
+
+    // // Generate dataset frequencies
+    inp = Channel.fromList(params.combined_ref)
+    imp_input = combineImpute.out
+    .map{target, ref, chrm, vcf -> [ref, target, vcf]}
     .combine(inp, by:0)
     .map{ ref_name, target_name, impute_vcf, ref_vcf -> [target_name, ref_name, file(impute_vcf), file(ref_vcf)]}
-    generate_frequency(input)
+    generate_frequency(imp_input)
 
     // // Plot frequency Comparison
-    freq_comp = impute_data.map {target_name, ref_name, vcf, impute_vcf, info -> 
-    [target_name, ref_name, info]}
-    .combine(generate_frequency.out, by:[0,1])
+    freq_comp = combineInfo.out
+        .map{target, ref, chrm, info -> [target, ref, info]}
+        .combine(generate_frequency.out, by:[0,1])
     plot_freq_comparison(freq_comp)
+
 
     // // Plot number of imputed SNPs over the mean r2 for all reference panels
-    combineInfo_frq = impute_data.map{ target_name, ref_name, vcf, impute_vcf, info ->[ target_name, ref_name, info, params.maf_thresh]}
-    .combine(generate_frequency.out, by:[0,1])
-    .map { target_name, ref_name, info, maf_thresh, target_frq, ref_frq -> 
-    [target_name, ref_name, info, maf_thresh, target_frq]}
+    // combineInfo_frq = impute_data.map{ target_name, ref_name, vcf, impute_vcf, info ->[ target_name, ref_name, info, params.maf_thresh]}
+    // .combine(generate_frequency.out, by:[0,1])
+    // .map { target_name, ref_name, info, maf_thresh, target_frq, ref_frq -> 
+    // [target_name, ref_name, info, maf_thresh, target_frq]}
+    // plot_r2_SNPpos(combineInfo_frq)
+
+    combineInfo_frq = combineInfo.out
+        .map{target, ref, chrm, info -> [target, ref, info, params.maf_thresh]}
+        .combine(generate_frequency.out, by:[0,1])
+        .map { target_name, ref_name, info, maf_thresh, target_frq, ref_frq -> 
+        [target_name, ref_name, info, maf_thresh, target_frq]}
+    // combineInfo_frq.view()
     plot_r2_SNPpos(combineInfo_frq)
 
     // // compute for average rsquared values
-    rsquared_input = impute_data.map{ target_name, ref_name, vcf, impute_vcf, info ->[ target_name, ref_name, info]}
+    // rsquared_input = impute_data.map{ target_name, ref_name, vcf, impute_vcf, info ->[ target_name, ref_name, info]}
+    // average_r2(rsquared_input)
+    rsquared_input = combineInfo.out
+        .map{target, ref, chrm, info -> [target, ref, info]}
     average_r2(rsquared_input)
+
 }
diff --git a/modules/impute.nf b/modules/impute.nf
@@ -31,9 +31,11 @@ process impute_minimac4 {
     input:
         tuple val(chrm), val(chunk_start), val(chunk_end), val(target_name), file(target_phased_vcf), file(target_phased_vcf_tbi), val(ref_name), file(ref_vcf), file(ref_m3vcf), val(tagName)
     output:
-        tuple val(chrm), val(chunk_start), val(chunk_end), val(target_name), val(ref_name), file("${base}_imputed.dose.vcf.gz"), file("${base}_imputed.info"), val(tagName)
+        tuple val(chrm), val(chunk_start), val(chunk_end), val(target_name), val(ref_name), file(imputed_vcf), file(indexed_vcf), file("${base}_imputed.info"), val(tagName)
     shell:
         base = "${file(target_phased_vcf.baseName).baseName}_${tagName}_${chrm}_${chunk_start}-${chunk_end}"
+        imputed_vcf = "${base}_imputed.dose.vcf.gz"
+        indexed_vcf = "${base}_imputed.dose.vcf.gz.tbi"
         """
         minimac4 \
             --refHaps ${ref_m3vcf} \
@@ -43,7 +45,8 @@ process impute_minimac4 {
             --minRatio ${params.minRatio} \
             --chr ${chrm} --start ${chunk_start} --end ${chunk_end} --window ${params.buffer_size} \
             --prefix ${base}_imputed \
-            --cpus ${task.cpus}
+            --cpus ${task.cpus} 
+        tabix ${imputed_vcf}
         """
 }
 
@@ -119,21 +122,21 @@ process impute_minimac4_1 {
 
 """
 Combine impute chunks to chromosomes
+bcftools +fill-tags -- -t AC,AN,AF,MAF | \
 """
 process combineImpute {
     tag "impComb_${target_name}_${ref_name}_${chrm}"
     publishDir "${params.outDir}/imputed/${ref_name}/${target_name}", overwrite: true, mode:'copy', pattern: '*imputed.vcf.gz'
     label "bigmem"
 
     input:
-        tuple val(target_name), val(ref_name), val(chrm), val(imputed_files)
+        tuple val(target_name), val(ref_name), val(chrm), file(imputed_files), file(indexed_files)
     output:
         tuple val(target_name), val(ref_name), val(chrm), file(comb_impute)// into combineImpute
     script:
         comb_impute = "${target_name}_${ref_name}_chr${chrm}.imputed.vcf.gz"
         """
-        bcftools concat ${imputed_files.join(' ')} | \
-        bcftools +fill-tags -- -t AC,AN,AF,MAF | \
+        bcftools concat -a ${imputed_files.join(' ')} | \
         bcftools sort -T . -Oz -o ${comb_impute}
         """
 }

diff --git a/modules/qc.nf b/modules/qc.nf
@@ -37,6 +37,7 @@ def check_chromosome_vcf(dataset, dataset_vcf, map_file, chrms){
     chromosomes_ = [:]
     chromosomes_['ALL'] = []
     valid_chrms = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22']
+    // valid_chrms = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
     not_chrs = []
     in_chrs = []
     notValid_chrs = []

diff --git a/trypanogen.config b/trypanogen.config
@@ -0,0 +1,115 @@
+params {
+  project_name            = 'hlaimputation'
+  project_description     = 'A simple imputation run on chrm 6 distributed with git repo'
+  outDir                  = "output/${project_name}" // outDir: where to put all pipeline's outputs
+  help                    = false
+  max_memory = 50.GB
+  max_cpus = 5
+  max_time = 96.h
+
+  // Reference panels
+  // Per chromosomes [name, m3vcf, vcf_or_bcf]
+  // Replace chromosome with %s for string interpolation to use multiple chromosomes
+  ref_panels = [
+     //['Trypanogen',"/scratch3/users/nanje/chipimputation_evaluate_chips/results/trypanogen/m3vcfs/Eagle_%s.m3vcf.gz",
+      //"/scratch3/users/nanje/chipimputation_evaluate_chips/John/Eagle_%s.vcf.gz"], 
+      ['Ugandan', "/scratch3/users/nanje/chipimputation_evaluate_chips/results/Ugandan/m3vcfs/Ugandan_%s_phased.m3vcf.gz",
+      "/scratch3/users/nanje/chipimputation_evaluate_chips/results/Ugandan/vcfs/Ugandan_%s_phased.vcf.gz"],
+  ]
+  combined_ref = [
+      ['Ugandan', "/scratch3/users/nanje/chipimputation_evaluate_chips/results/Ugandan/vcfs/Ugandan_merged_sorted.vcf.gz"],
+  ]
+
+  combined_imputed = [
+      ["Ugandan", "John_target", "/scratch3/users/nanje/chipimputation/chipimputation/work/28/1a22e891a90d3c18473e7d56315e1b/John_target_Ugandan_chr.imputed.vcf.gz"],
+  ]
+
+  // Study datasets
+  target_datasets = [
+    // ["cleandatanops", "/scratch3/users/nanje/chipimputation_evaluate_chips/John/target/cleandatanops2-updated3.vcf.gz"],
+     ["John_target", "/scratch3/users/nanje/John/sample_1_22.vcf.gz"],
+  ]
+
+  // Genetic map for eagle2
+  eagle_genetic_map = "/cbio/dbs/refpanels/eagle/tables/genetic_map_hg19_withX.txt.gz"
+
+  // Reference genome used for QC
+  reference_genome  = "/cbio/dbs/gatk/2.8/b37/human_g1k_v37_decoy.fasta"
+
+  // List chromosomes to be used. chr22 for b38. Use ALL or '' to use all available chromosome in the target dataset
+  chromosomes = '1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22'
+
+  // imputation parameters
+  NE = "20000"
+  impute_iter = "10"
+  impute_burnin = "2" // must be less than impute_burnin
+  impute_info_cutoff = "0.3"
+  chunk_size = "20000000" // in base
+  buffer_size = "1000000" // in base
+
+  // QC parameters
+  site_miss                = "0.05"
+  hwe			  = "0.00001"
+  mac			  = "1"
+  min_ac = '2'
+  min_alleles = '2'
+  max_alleles = '2'
+
+  // Phasing method: shapeit (default) or eagle2
+  phasing_method          = "eagle"
+
+  // Imputation method minimac4 (default) or IMPUTE2
+  impute_method           = "minimac4"
+
+  // Minimac4 option
+  minRatio = '0.1'
+
+  // Phasing method: eagle2 (default) or shapeit
+  phasing_method          = "eagle"
+
+  // Plink to use, sometimes it can be plink2
+  plink="plink2"
+
+}
+
+timeline {
+  enabled = true
+  file = "${params.outDir}/nextflow_reports/${params.project_name}_h3achipimputation_timeline.html"
+}
+report {
+  enabled = true
+  file = "${params.outDir}/nextflow_reports/${params.project_name}_h3achipimputation_report.html"
+}
+trace {
+  enabled = true
+  file = "${params.outDir}/nextflow_reports/${params.project_name}_h3achipimputation_trace.txt"
+}
+dag {
+  enabled = true
+  file = "${params.outDir}/nextflow_reports/${params.project_name}_h3achipimputation_dag.png"
+}
+
+process {
+// Process-specific resource requirements
+  withLabel: 'medium' {
+    errorStrategy = { task.exitStatus in [143, 137, 255] ? 'retry' : 'ignore' }
+    memory = 10.GB
+  }
+  withLabel : 'bigmem' {
+    errorStrategy = { task.exitStatus in [143, 137, 255] ? 'retry' : 'ignore' }
+    memory = 70.GB
+    time = { 24.h * task.attempt }
+  }
+}
+
+profiles{
+  singularity {
+    singularity.runOptions = " -B ${HOME} "
+  }
+  slurm {
+    queueSize = 100
+  }
+  test {
+    queueSize = 10
+  }
+}