From a6ac0e3fe4ba9975abc60bf1c17be493f29adb0b Mon Sep 17 00:00:00 2001
From: Beibei Chen <beibei.chen@utsouthwestern.edu>
Date: Tue, 27 Dec 2016 15:40:34 -0600
Subject: [PATCH] deeptools qc works

---
 workflow/main.nf                 | 133 +++++--------------------------
 workflow/scripts/runDeepTools.py |  77 ++++++++++++++++++
 2 files changed, 96 insertions(+), 114 deletions(-)
 create mode 100644 workflow/scripts/runDeepTools.py

diff --git a/workflow/main.nf b/workflow/main.nf
index 679a006..d7b7829 100644
--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -1,126 +1,31 @@
 #!/usr/bin/env nextflow
 	
 // Default parameter values to run tests
-params.bams="$baseDir/../test/*.bam"
+// params.bams="$baseDir/../test/*.bam"
 params.design="$baseDir/../test/samplesheet.csv"
-#params.genome="/project/shared/bicf_workflow_ref/GRCh37/"
+// params.genome="/project/shared/bicf_workflow_ref/GRCh37/"
 
-design_file = file(params.design)
-bams=file(params.bams)
-#gtf_file = file("$params.genome/gencode.gtf")
-#genenames = file("$params.genome/genenames.txt")
-#geneset = file("$params.genome/gsea_gmt/$params.geneset")
+// design_file = file(params.design)
+// bams=file(params.bams)
+//gtf_file = file("$params.genome/gencode.gtf")
+//genenames = file("$params.genome/genenames.txt")
+//geneset = file("$params.genome/gsea_gmt/$params.geneset")
 
 
-// Pair handling, helper function taken from rnatoy
-// which is covered by the GNU General Public License v3
-// https://github.com/nextflow-io/rnatoy/blob/master/main.nf
-
-def fileMap = [:]
-
-fastqs.each {
-    final fileName = it.getFileName().toString()
-    prefix = fileName.lastIndexOf('/')
-    fileMap[fileName] = it
-}
-def prefix = []
-new File(params.design).withReader { reader ->
-    def hline = reader.readLine()
-    def header = hline.split(",")
-    prefixidx = header.findIndexOf{it == 'Condition'};
-    peakidx = header.findIndexOf{it == 'Peaks'};
-    bamipidx = header.findIndexOf{it == 'bamReads'};
-    bamctrlidx = header.findIndexOf{it == 'bamControl'};
-    if (bamctrlidx == -1) {
-       error "Must provide control BAM file"
-       }      
-    if (peakidx == -1) {
-       error "Must provide peak file"
-       }
-    while (line = reader.readLine()) {
-    	   def row = line.split(",")
-	   if (fileMap.get(row[bamipidx]) != null) {
-	      prefix << tuple(row[prefixidx],fileMap.get(row[peakidx]),fileMap.get(row[bamipidx]),fileMap.get(row[bamctrlidx]))
-	   }
-	  
-} 
-}
-if( ! prefix) { error "Didn't match any input files with entries in the design file" }
-
-if (params.pairs == 'pe') {
-Channel
-  .from(prefix)
-  .set { read_pe }
-Channel
-  .empty()
-  .set { read_se } 
-}
-if (params.pairs == 'se') {
-Channel
-  .from(prefix)
-  .into { read_se }
-Channel
-  .empty()
-  .set { read_pe }
-}
-
 
 process peakanno {
-   publishDir "$baseDir/output", mode: 'copy'
-   input:
-   file peak_file from greencenter
-   file design_file from input
-   file annotation Tdx
-   output:
-   set peak_id, file("${pattern}_annotation.xls"), file("${pattern}_peakTssDistribution.png") into peakanno
-   """
-   module load R/3.2.1-intel
-   Rscript $baseDir/scripts/runchipseeker.R
-   """
-}
-
-//Run deeptools for QC and other plots
-//Since this problem also need all input files, need to build another channel?
-process deeptools {
-   publishDir "$baseDir/output", mode: 'copy'
-   input: 
-   file peak_file from input
-   file bam_file from imput
+//   publishDir "$baseDir/output", mode: 'copy'
+//   input:
+//   file design_file from input
+//   file annotation Tdx
    output:
-   set
-
+     stdout result
+//   set peak_id, file("${pattern}_annotation.xls"), file("${pattern}_peakTssDistribution.png") into peakanno
+     script:
+     """
+     module load R/3.2.1-intel
+     python $baseDir/scripts/process.py
+     #Rscript /project/BICF/BICF_Core/bchen4/chipseq_analysis/workflow/scripts/runchipseeker.R     
+"""
 }
 
-//Need to do it with more than 1 condition
-process diffbind {
-   publishDir "$baseDir/output", mode: 'copy'
-   input:
-   file peak_file from input
-   file design_file name 'design.txt'
-   file bam_file from input
-   output:
-   file "*.txt" into txtfiles
-   file "*.png" into psfiles
-   file "*"
-   """
-   module load R/3.2.1-intel
-   #cp design.txt design.shiny.txt
-   #cp geneset.gmt geneset.shiny.gmt
-   Rscript  $baseDir/scripts/runDiffBind.R
- """
-}
-
-process buildrda {
-   publishDir "$baseDir/output", mode: 'copy'
-   input:
-   file stringtie_dir from strcts.toList()
-   file design_file name 'design.txt'
-   output:
-   file "bg.rda" into rdafiles
-   """
-   module load R/3.2.1-intel
-   Rscript $baseDir/scripts/build_ballgown.R *_stringtie
-   """
- }
-
-
diff --git a/workflow/scripts/runDeepTools.py b/workflow/scripts/runDeepTools.py
new file mode 100644
index 0000000..091b59f
--- /dev/null
+++ b/workflow/scripts/runDeepTools.py
@@ -0,0 +1,77 @@
+#!/usr/bin/python
+# programmer : bbc
+# usage:
+
+import sys
+import argparse as ap
+import logging
+import subprocess
+import pandas as pd
+from multiprocessing import Pool
+logging.basicConfig(level=10)
+
+
+def prepare_argparser():
+  description = "Make wig file for given bed using bam"
+  epilog = "For command line options of each command, type %(prog)% COMMAND -h"
+  argparser = ap.ArgumentParser(description=description, epilog = epilog)
+  argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input BAM file")
+  argparser.add_argument("-g","--genome",dest = "genome",type=str,required=True, help="genome", default="hg19")
+  #argparser.add_argument("-b","--bed",dest="bedfile",type=str,required=True, help = "Gene locus in bed format")
+  #argparser.add_argument("-s","--strandtype",dest="stranded",type=str,default="none", choices=["none","reverse","yes"])
+  #argparser.add_argument("-n","--name",dest="trackName",type=str,default="UserTrack",help = "track name for bedgraph header")
+  return(argparser)
+
+def run_qc(files, controls, labels):
+  mbs_command = "multiBamSummary bins --bamfiles "+' '.join(files)+" -out sample_mbs.npz"
+  #p = subprocess.Popen(mbs_command, shell=True)
+  #logging.debug(mbs_command)
+  #p.communicate()
+  pcor_command = "plotCorrelation -in sample_mbs.npz --corMethod spearman --skipZeros --plotTitle \"Spearman Correlation of Read Counts\" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers --outFileCorMatrix experiment.deeptools.spearmanCorr_readCounts.tab -o experiment.deeptools.heatmap_spearmanCorr_readCounts_v2.png --labels "+" ".join(labels)
+  #logging.debug(pcor_command)
+  #p = subprocess.Popen(pcor_command, shell=True)
+  #p.communicate()
+  #plotCoverage
+  pcov_command = "plotCoverage -b "+" ".join(files)+" --plotFile experiment.deeptools_coverage.png -n 1000000 --plotTitle \"sample coverage\" --ignoreDuplicates --minMappingQuality 10"
+  p = subprocess.Popen(pcov_command, shell=True)
+  p.communicate()
+  #draw fingerprints plots
+  #for treat,ctrl,name in zip(files,controls,labels):
+  #  fp_command = "plotFingerprint -b "+treat+" "+ctrl+" --labels "+name+" control --plotFile "+name+".deeptools_fingerprints.png"
+  #  p = subprocess.Popen(fp_command, shell=True)
+  #  p.communicate()
+
+def bam2bw_wrapper(command):
+  p = subprocess.Popen(command, shell=True)
+  p.communicate()
+
+def run_signal(files, labels, genome):
+  #compute matrix and draw profile and heatmap
+  gene_bed = "/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/genome/"+genome+"/gene.bed"
+  bw_commands = []
+  for f in files:
+    bw_commands.append("bamCoverage -b "+f+" -o "+f.replace("bam","bw"))
+  work_pool = Pool(min(len(files), 12))
+  work_pool.map(bam2bw_wrapper, bw_commands)
+  work_pool.close()
+  work_pool.join()
+  
+  #cm_command = "computeMatrix scale-regions "
+
+def run(dfile,genome):
+  #Get annotation address
+  tss_bed = "/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/genome/"+genome+"/tss.bed"
+  #parse dfile, suppose data files are the same folder as design file
+  dfile = pd.read_csv(dfile)
+  #QC: multiBamSummary and plotCorrelation
+  run_qc(dfile['bamReads'], dfile['bamControl'], dfile['SampleID']) 
+  
+
+def main():
+  argparser = prepare_argparser()
+  args = argparser.parse_args()
+  run(args.infile, args.genome)
+  
+
+if __name__=="__main__":
+  main()
-- 
GitLab