diff --git a/workflow/main.nf b/workflow/main.nf index 679a00691bf1b930c00ef44cbdcd623e9f3644a2..d7b7829dcd1bed9de3798052ff59895c24db61f0 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -1,126 +1,31 @@ #!/usr/bin/env nextflow // Default parameter values to run tests -params.bams="$baseDir/../test/*.bam" +// params.bams="$baseDir/../test/*.bam" params.design="$baseDir/../test/samplesheet.csv" -#params.genome="/project/shared/bicf_workflow_ref/GRCh37/" +// params.genome="/project/shared/bicf_workflow_ref/GRCh37/" -design_file = file(params.design) -bams=file(params.bams) -#gtf_file = file("$params.genome/gencode.gtf") -#genenames = file("$params.genome/genenames.txt") -#geneset = file("$params.genome/gsea_gmt/$params.geneset") +// design_file = file(params.design) +// bams=file(params.bams) +//gtf_file = file("$params.genome/gencode.gtf") +//genenames = file("$params.genome/genenames.txt") +//geneset = file("$params.genome/gsea_gmt/$params.geneset") -// Pair handling, helper function taken from rnatoy -// which is covered by the GNU General Public License v3 -// https://github.com/nextflow-io/rnatoy/blob/master/main.nf - -def fileMap = [:] - -fastqs.each { - final fileName = it.getFileName().toString() - prefix = fileName.lastIndexOf('/') - fileMap[fileName] = it -} -def prefix = [] -new File(params.design).withReader { reader -> - def hline = reader.readLine() - def header = hline.split(",") - prefixidx = header.findIndexOf{it == 'Condition'}; - peakidx = header.findIndexOf{it == 'Peaks'}; - bamipidx = header.findIndexOf{it == 'bamReads'}; - bamctrlidx = header.findIndexOf{it == 'bamControl'}; - if (bamctrlidx == -1) { - error "Must provide control BAM file" - } - if (peakidx == -1) { - error "Must provide peak file" - } - while (line = reader.readLine()) { - def row = line.split(",") - if (fileMap.get(row[bamipidx]) != null) { - prefix << tuple(row[prefixidx],fileMap.get(row[peakidx]),fileMap.get(row[bamipidx]),fileMap.get(row[bamctrlidx])) - } - -} -} -if( ! prefix) { error "Didn't match any input files with entries in the design file" } - -if (params.pairs == 'pe') { -Channel - .from(prefix) - .set { read_pe } -Channel - .empty() - .set { read_se } -} -if (params.pairs == 'se') { -Channel - .from(prefix) - .into { read_se } -Channel - .empty() - .set { read_pe } -} - process peakanno { - publishDir "$baseDir/output", mode: 'copy' - input: - file peak_file from greencenter - file design_file from input - file annotation Tdx - output: - set peak_id, file("${pattern}_annotation.xls"), file("${pattern}_peakTssDistribution.png") into peakanno - """ - module load R/3.2.1-intel - Rscript $baseDir/scripts/runchipseeker.R - """ -} - -//Run deeptools for QC and other plots -//Since this problem also need all input files, need to build another channel? -process deeptools { - publishDir "$baseDir/output", mode: 'copy' - input: - file peak_file from input - file bam_file from imput +// publishDir "$baseDir/output", mode: 'copy' +// input: +// file design_file from input +// file annotation Tdx output: - set - + stdout result +// set peak_id, file("${pattern}_annotation.xls"), file("${pattern}_peakTssDistribution.png") into peakanno + script: + """ + module load R/3.2.1-intel + python $baseDir/scripts/process.py + #Rscript /project/BICF/BICF_Core/bchen4/chipseq_analysis/workflow/scripts/runchipseeker.R +""" } -//Need to do it with more than 1 condition -process diffbind { - publishDir "$baseDir/output", mode: 'copy' - input: - file peak_file from input - file design_file name 'design.txt' - file bam_file from input - output: - file "*.txt" into txtfiles - file "*.png" into psfiles - file "*" - """ - module load R/3.2.1-intel - #cp design.txt design.shiny.txt - #cp geneset.gmt geneset.shiny.gmt - Rscript $baseDir/scripts/runDiffBind.R - """ -} - -process buildrda { - publishDir "$baseDir/output", mode: 'copy' - input: - file stringtie_dir from strcts.toList() - file design_file name 'design.txt' - output: - file "bg.rda" into rdafiles - """ - module load R/3.2.1-intel - Rscript $baseDir/scripts/build_ballgown.R *_stringtie - """ - } - - diff --git a/workflow/scripts/runDeepTools.py b/workflow/scripts/runDeepTools.py new file mode 100644 index 0000000000000000000000000000000000000000..091b59f8ef5dafb861cf8f04ba5bab901127fb3f --- /dev/null +++ b/workflow/scripts/runDeepTools.py @@ -0,0 +1,77 @@ +#!/usr/bin/python +# programmer : bbc +# usage: + +import sys +import argparse as ap +import logging +import subprocess +import pandas as pd +from multiprocessing import Pool +logging.basicConfig(level=10) + + +def prepare_argparser(): + description = "Make wig file for given bed using bam" + epilog = "For command line options of each command, type %(prog)% COMMAND -h" + argparser = ap.ArgumentParser(description=description, epilog = epilog) + argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input BAM file") + argparser.add_argument("-g","--genome",dest = "genome",type=str,required=True, help="genome", default="hg19") + #argparser.add_argument("-b","--bed",dest="bedfile",type=str,required=True, help = "Gene locus in bed format") + #argparser.add_argument("-s","--strandtype",dest="stranded",type=str,default="none", choices=["none","reverse","yes"]) + #argparser.add_argument("-n","--name",dest="trackName",type=str,default="UserTrack",help = "track name for bedgraph header") + return(argparser) + +def run_qc(files, controls, labels): + mbs_command = "multiBamSummary bins --bamfiles "+' '.join(files)+" -out sample_mbs.npz" + #p = subprocess.Popen(mbs_command, shell=True) + #logging.debug(mbs_command) + #p.communicate() + pcor_command = "plotCorrelation -in sample_mbs.npz --corMethod spearman --skipZeros --plotTitle \"Spearman Correlation of Read Counts\" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers --outFileCorMatrix experiment.deeptools.spearmanCorr_readCounts.tab -o experiment.deeptools.heatmap_spearmanCorr_readCounts_v2.png --labels "+" ".join(labels) + #logging.debug(pcor_command) + #p = subprocess.Popen(pcor_command, shell=True) + #p.communicate() + #plotCoverage + pcov_command = "plotCoverage -b "+" ".join(files)+" --plotFile experiment.deeptools_coverage.png -n 1000000 --plotTitle \"sample coverage\" --ignoreDuplicates --minMappingQuality 10" + p = subprocess.Popen(pcov_command, shell=True) + p.communicate() + #draw fingerprints plots + #for treat,ctrl,name in zip(files,controls,labels): + # fp_command = "plotFingerprint -b "+treat+" "+ctrl+" --labels "+name+" control --plotFile "+name+".deeptools_fingerprints.png" + # p = subprocess.Popen(fp_command, shell=True) + # p.communicate() + +def bam2bw_wrapper(command): + p = subprocess.Popen(command, shell=True) + p.communicate() + +def run_signal(files, labels, genome): + #compute matrix and draw profile and heatmap + gene_bed = "/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/genome/"+genome+"/gene.bed" + bw_commands = [] + for f in files: + bw_commands.append("bamCoverage -b "+f+" -o "+f.replace("bam","bw")) + work_pool = Pool(min(len(files), 12)) + work_pool.map(bam2bw_wrapper, bw_commands) + work_pool.close() + work_pool.join() + + #cm_command = "computeMatrix scale-regions " + +def run(dfile,genome): + #Get annotation address + tss_bed = "/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/genome/"+genome+"/tss.bed" + #parse dfile, suppose data files are the same folder as design file + dfile = pd.read_csv(dfile) + #QC: multiBamSummary and plotCorrelation + run_qc(dfile['bamReads'], dfile['bamControl'], dfile['SampleID']) + + +def main(): + argparser = prepare_argparser() + args = argparser.parse_args() + run(args.infile, args.genome) + + +if __name__=="__main__": + main()