Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ nextflow run h3abionet/chipimputation/main.nf -profile test,singularity

Check for results in `./output`

In case you are running the outdated version, run the code below prior to executing the pipeline.

```
nextflow pull h3abionet/chipimputation
```

### Start running your own analysis

Expand Down
113 changes: 76 additions & 37 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ workflow preprocess {
take: datasets

main:


//// check if study genotype files exist
target_datasets = []
datasets.each { name, vcf ->
Expand Down Expand Up @@ -159,20 +161,23 @@ workflow report_by_ref{
take: data

main:
/// /// By Refecene panel
// combine by chrom, dataset, refpanel
// /// /// By Refecene panel
// // combine by chrom, dataset, refpanel
imputeCombine_ref = data
.groupTuple( by:[1] )
.map{ datasets, refpanel, vcfs, imputed_vcfs, imputed_infos -> [ refpanel, datasets.join(','), '', imputed_infos.join(',') ] }
filter_info_by_target( imputeCombine_ref )

/// change to group_by_maf
.groupTuple( by:[1,0] )
// .map{ datasets, refpanel, vcfs, imputed_vcfs, imputed_infos -> [ refpanel, datasets.join(','), '', imputed_infos.join(',') ] }
.map{ datasets, refpanel, vcfs, imputed_vcfs, indexed_vcf, imputed_infos -> [ refpanel, datasets, '', imputed_infos ] }
// imputeCombine_ref.view()
combineInfo(imputeCombine_ref)
filter_info_by_target( combineInfo.out )

// /// change to group_by_maf
report_well_imputed_by_target( filter_info_by_target.out.map{ target_name, ref_panels, wellInfo, accInfo -> [ target_name, ref_panels, file(wellInfo) ]} )

//// Plot performance all targets by maf for a reference panel
// //// Plot performance all targets by maf for a reference panel
plot_performance_target( report_well_imputed_by_target.out.map{ target_name, ref_panels, wellInfo, wellInfo_summary -> [ target_name, ref_panels, file(wellInfo), file(wellInfo_summary), 'DATASETS' ]} )

//// Accuracy/Concordance
// //// Accuracy/Concordance
report_accuracy_target( filter_info_by_target.out.map{ target_name, ref_panels, wellInfo, accInfo -> [ target_name, ref_panels, file(accInfo), 'DATASETS' ]} )
plot_accuracy_target ( report_accuracy_target.out )
emit:
Expand All @@ -183,29 +188,33 @@ workflow report_by_dataset{
take: data

main:
/// /// By Dataset
// combine by chrom, dataset, refpanel
// /// /// By Dataset
// // combine by chrom, dataset, refpanel

imputeCombine_ref = data
.groupTuple( by:[0] )
.map{ dataset, refpanels, vcfs, imputed_vcfs, imputed_infos -> [ dataset, refpanels.join(','), '', imputed_infos.join(',') ] }
filter_info_by_target( imputeCombine_ref )
.groupTuple( by:[0,1] )
// // .map{ dataset, refpanels, vcfs, imputed_vcfs, imputed_infos -> [ dataset, refpanels[0], '', imputed_infos.join(',') ] }
.map{ dataset, refpanels, vcfs, imputed_vcfs, indexed_vcfs, imputed_infos -> [ dataset, refpanels, '', imputed_infos ] }
// imputeCombine_ref.view()
combineInfo(imputeCombine_ref)
filter_info_by_target( combineInfo.out )


///// Number of well imputed snps
/// change to group_by_maf
// ///// Number of well imputed snps
// // / change to group_by_maf
report_well_imputed_by_target( filter_info_by_target.out.map{ target_name, ref_panels, wellInfo, accInfo -> [ target_name, ref_panels, file(wellInfo) ]} )
/// Plot performance all targets by maf for a reference panel
// // / Plot performance all targets by maf for a reference panel
plot_performance_target( report_well_imputed_by_target.out.map{ target_name, ref_panels, wellInfo, wellInfo_summary -> [ target_name, ref_panels, file(wellInfo), file(wellInfo_summary), 'REFERENCE_PANELS' ]} )

//// Accuracy/Concordance
// //// Accuracy/Concordance
report_accuracy_target( filter_info_by_target.out.map{ target_name, ref_panels, wellInfo, accInfo -> [ target_name, ref_panels, file(accInfo), 'REFERENCE_PANELS' ]} )
plot_accuracy_target ( report_accuracy_target.out )

// Plot number of imputed SNPs over the mean r2 for all reference panels
input = imputeCombine_ref
// // Plot number of imputed SNPs over the mean r2 for all reference panels
input = combineInfo.out
.map{ dataset, refpanels, chrm, infos -> [dataset, refpanels, infos]}

// Plot number of imputed SNPs over the mean r2 for all reference panels
// // Plot number of imputed SNPs over the mean r2 for all reference panels
plot_r2_SNPcount(input)

// Plot histograms of number of imputed SNPs over the mean r2 for all reference panels
Expand Down Expand Up @@ -251,40 +260,70 @@ workflow {
// Reporting
// impute.out.chunks_imputed.view()
impute_data = impute.out.chunks_imputed
.map{chr, fwd, rev, test_data, ref, imputed_vcf,
imputed_info, tst_data -> [test_data, ref, imputed_vcf, imputed_info]}
.map{chr, fwd, rev, test_data, ref, imputed_vcf, indexed_vcf,
imputed_info, tst_data -> [test_data, ref, imputed_vcf, indexed_vcf, imputed_info]}
.combine(params.target_datasets, by:0)
.map {test_data, ref, imputed_vcf, imputed_info, orig_vcf
-> [test_data, ref, orig_vcf, imputed_vcf, imputed_info]}
.map {test_data, ref, imputed_vcf, indexed_vcf, imputed_info, orig_vcf
-> [test_data, ref, orig_vcf, imputed_vcf, indexed_vcf, imputed_info]}


// //// Report by Reference
report_by_ref( impute_data )

// //// Report by datasets
report_by_dataset( impute_data )

// // Generate dataset frequencies
inp = Channel.fromList(params.ref_panels).map{ref, m3vcf, vcf -> [ref, vcf]}
// Combine vcfs

input = impute_data
.map{ target_name, ref_name, vcf, impute_vcf, info ->[ ref_name, target_name, file(impute_vcf)]}
.groupTuple( by:[0,1] )
.map{ target_name, ref_name, vcf, impute_vcf, indexed_vcf, info ->[ target_name, ref_name, '', impute_vcf, indexed_vcf]}
// input.view()
combineImpute(input)

// Combine Infos
imputeCombine_ref = impute_data
.groupTuple( by:[0,1] )
.map{ dataset, refpanels, vcfs, imputed_vcfs, indexed_vcfs, imputed_infos -> [ dataset, refpanels, '', imputed_infos ] }
// imputeCombine_ref.view()
combineInfo(imputeCombine_ref)


// // Generate dataset frequencies
inp = Channel.fromList(params.combined_ref)
imp_input = combineImpute.out
.map{target, ref, chrm, vcf -> [ref, target, vcf]}
.combine(inp, by:0)
.map{ ref_name, target_name, impute_vcf, ref_vcf -> [target_name, ref_name, file(impute_vcf), file(ref_vcf)]}
generate_frequency(input)
generate_frequency(imp_input)

// // Plot frequency Comparison
freq_comp = impute_data.map {target_name, ref_name, vcf, impute_vcf, info ->
[target_name, ref_name, info]}
.combine(generate_frequency.out, by:[0,1])
freq_comp = combineInfo.out
.map{target, ref, chrm, info -> [target, ref, info]}
.combine(generate_frequency.out, by:[0,1])
plot_freq_comparison(freq_comp)


// // Plot number of imputed SNPs over the mean r2 for all reference panels
combineInfo_frq = impute_data.map{ target_name, ref_name, vcf, impute_vcf, info ->[ target_name, ref_name, info, params.maf_thresh]}
.combine(generate_frequency.out, by:[0,1])
.map { target_name, ref_name, info, maf_thresh, target_frq, ref_frq ->
[target_name, ref_name, info, maf_thresh, target_frq]}
// combineInfo_frq = impute_data.map{ target_name, ref_name, vcf, impute_vcf, info ->[ target_name, ref_name, info, params.maf_thresh]}
// .combine(generate_frequency.out, by:[0,1])
// .map { target_name, ref_name, info, maf_thresh, target_frq, ref_frq ->
// [target_name, ref_name, info, maf_thresh, target_frq]}
// plot_r2_SNPpos(combineInfo_frq)

combineInfo_frq = combineInfo.out
.map{target, ref, chrm, info -> [target, ref, info, params.maf_thresh]}
.combine(generate_frequency.out, by:[0,1])
.map { target_name, ref_name, info, maf_thresh, target_frq, ref_frq ->
[target_name, ref_name, info, maf_thresh, target_frq]}
// combineInfo_frq.view()
plot_r2_SNPpos(combineInfo_frq)

// // compute for average rsquared values
rsquared_input = impute_data.map{ target_name, ref_name, vcf, impute_vcf, info ->[ target_name, ref_name, info]}
// rsquared_input = impute_data.map{ target_name, ref_name, vcf, impute_vcf, info ->[ target_name, ref_name, info]}
// average_r2(rsquared_input)
rsquared_input = combineInfo.out
.map{target, ref, chrm, info -> [target, ref, info]}
average_r2(rsquared_input)

}
13 changes: 8 additions & 5 deletions modules/impute.nf
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,11 @@ process impute_minimac4 {
input:
tuple val(chrm), val(chunk_start), val(chunk_end), val(target_name), file(target_phased_vcf), file(target_phased_vcf_tbi), val(ref_name), file(ref_vcf), file(ref_m3vcf), val(tagName)
output:
tuple val(chrm), val(chunk_start), val(chunk_end), val(target_name), val(ref_name), file("${base}_imputed.dose.vcf.gz"), file("${base}_imputed.info"), val(tagName)
tuple val(chrm), val(chunk_start), val(chunk_end), val(target_name), val(ref_name), file(imputed_vcf), file(indexed_vcf), file("${base}_imputed.info"), val(tagName)
shell:
base = "${file(target_phased_vcf.baseName).baseName}_${tagName}_${chrm}_${chunk_start}-${chunk_end}"
imputed_vcf = "${base}_imputed.dose.vcf.gz"
indexed_vcf = "${base}_imputed.dose.vcf.gz.tbi"
"""
minimac4 \
--refHaps ${ref_m3vcf} \
Expand All @@ -43,7 +45,8 @@ process impute_minimac4 {
--minRatio ${params.minRatio} \
--chr ${chrm} --start ${chunk_start} --end ${chunk_end} --window ${params.buffer_size} \
--prefix ${base}_imputed \
--cpus ${task.cpus}
--cpus ${task.cpus}
tabix ${imputed_vcf}
"""
}

Expand Down Expand Up @@ -119,21 +122,21 @@ process impute_minimac4_1 {

"""
Combine impute chunks to chromosomes
bcftools +fill-tags -- -t AC,AN,AF,MAF | \
"""
process combineImpute {
tag "impComb_${target_name}_${ref_name}_${chrm}"
publishDir "${params.outDir}/imputed/${ref_name}/${target_name}", overwrite: true, mode:'copy', pattern: '*imputed.vcf.gz'
label "bigmem"

input:
tuple val(target_name), val(ref_name), val(chrm), val(imputed_files)
tuple val(target_name), val(ref_name), val(chrm), file(imputed_files), file(indexed_files)
output:
tuple val(target_name), val(ref_name), val(chrm), file(comb_impute)// into combineImpute
script:
comb_impute = "${target_name}_${ref_name}_chr${chrm}.imputed.vcf.gz"
"""
bcftools concat ${imputed_files.join(' ')} | \
bcftools +fill-tags -- -t AC,AN,AF,MAF | \
bcftools concat -a ${imputed_files.join(' ')} | \
bcftools sort -T . -Oz -o ${comb_impute}
"""
}
Expand Down
1 change: 1 addition & 0 deletions modules/qc.nf
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def check_chromosome_vcf(dataset, dataset_vcf, map_file, chrms){
chromosomes_ = [:]
chromosomes_['ALL'] = []
valid_chrms = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22']
// valid_chrms = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
not_chrs = []
in_chrs = []
notValid_chrs = []
Expand Down
115 changes: 115 additions & 0 deletions trypanogen.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
params {
project_name = 'hlaimputation'
project_description = 'A simple imputation run on chrm 6 distributed with git repo'
outDir = "output/${project_name}" // outDir: where to put all pipeline's outputs
help = false
max_memory = 50.GB
max_cpus = 5
max_time = 96.h

// Reference panels
// Per chromosomes [name, m3vcf, vcf_or_bcf]
// Replace chromosome with %s for string interpolation to use multiple chromosomes
ref_panels = [
//['Trypanogen',"/scratch3/users/nanje/chipimputation_evaluate_chips/results/trypanogen/m3vcfs/Eagle_%s.m3vcf.gz",
//"/scratch3/users/nanje/chipimputation_evaluate_chips/John/Eagle_%s.vcf.gz"],
['Ugandan', "/scratch3/users/nanje/chipimputation_evaluate_chips/results/Ugandan/m3vcfs/Ugandan_%s_phased.m3vcf.gz",
"/scratch3/users/nanje/chipimputation_evaluate_chips/results/Ugandan/vcfs/Ugandan_%s_phased.vcf.gz"],
]
combined_ref = [
['Ugandan', "/scratch3/users/nanje/chipimputation_evaluate_chips/results/Ugandan/vcfs/Ugandan_merged_sorted.vcf.gz"],
]

combined_imputed = [
["Ugandan", "John_target", "/scratch3/users/nanje/chipimputation/chipimputation/work/28/1a22e891a90d3c18473e7d56315e1b/John_target_Ugandan_chr.imputed.vcf.gz"],
]

// Study datasets
target_datasets = [
// ["cleandatanops", "/scratch3/users/nanje/chipimputation_evaluate_chips/John/target/cleandatanops2-updated3.vcf.gz"],
["John_target", "/scratch3/users/nanje/John/sample_1_22.vcf.gz"],
]

// Genetic map for eagle2
eagle_genetic_map = "/cbio/dbs/refpanels/eagle/tables/genetic_map_hg19_withX.txt.gz"

// Reference genome used for QC
reference_genome = "/cbio/dbs/gatk/2.8/b37/human_g1k_v37_decoy.fasta"

// List chromosomes to be used. chr22 for b38. Use ALL or '' to use all available chromosome in the target dataset
chromosomes = '1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22'

// imputation parameters
NE = "20000"
impute_iter = "10"
impute_burnin = "2" // must be less than impute_burnin
impute_info_cutoff = "0.3"
chunk_size = "20000000" // in base
buffer_size = "1000000" // in base

// QC parameters
site_miss = "0.05"
hwe = "0.00001"
mac = "1"
min_ac = '2'
min_alleles = '2'
max_alleles = '2'

// Phasing method: shapeit (default) or eagle2
phasing_method = "eagle"

// Imputation method minimac4 (default) or IMPUTE2
impute_method = "minimac4"

// Minimac4 option
minRatio = '0.1'

// Phasing method: eagle2 (default) or shapeit
phasing_method = "eagle"

// Plink to use, sometimes it can be plink2
plink="plink2"

}

timeline {
enabled = true
file = "${params.outDir}/nextflow_reports/${params.project_name}_h3achipimputation_timeline.html"
}
report {
enabled = true
file = "${params.outDir}/nextflow_reports/${params.project_name}_h3achipimputation_report.html"
}
trace {
enabled = true
file = "${params.outDir}/nextflow_reports/${params.project_name}_h3achipimputation_trace.txt"
}
dag {
enabled = true
file = "${params.outDir}/nextflow_reports/${params.project_name}_h3achipimputation_dag.png"
}

process {
// Process-specific resource requirements
withLabel: 'medium' {
errorStrategy = { task.exitStatus in [143, 137, 255] ? 'retry' : 'ignore' }
memory = 10.GB
}
withLabel : 'bigmem' {
errorStrategy = { task.exitStatus in [143, 137, 255] ? 'retry' : 'ignore' }
memory = 70.GB
time = { 24.h * task.attempt }
}
}

profiles{
singularity {
singularity.runOptions = " -B ${HOME} "
}
slurm {
queueSize = 100
}
test {
queueSize = 10
}
}