Add align sampled data

2b89cc74 · Gervaise Henry · 0f30eb62 · 2b89cc74 · 2b89cc74 · 2b89cc74
Commit 2b89cc74 authored 5 years ago by Gervaise Henry
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -12,6 +12,9 @@ process {
  withName:parseMetadata {
    executor = 'local'
  }
+  withName: getRefInfer {
+    executor = 'local'
+  }
  withName:getRef {
    executor = 'local'
  }
@@ -21,6 +24,9 @@ process {
  withName:downsampleData {
    executor = 'local'
  }
+  withName:alignSampleData {
+    queue = 'super'
+  }
  withName:alignData {
    queue = '256GB,256GBv1'
  }

--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -20,6 +20,9 @@ process {
  withName: parseMetadata {
    container = 'bicf/python3:1.3'
  }
+  withName: getRefInfer {
+    container = 'bicf/awscli:1.1'
+  }
  withName: getRef {
    container = 'bicf/awscli:1.1'
  }
@@ -29,6 +32,9 @@ process {
  withName: downsampleData {
    container = 'bicf/seqtk:2.0.0'
  }
+  withName: alignSampleData {
+    container = 'bicf/gudmaprbkaligner:2.0.0'
+  }
  withName: alignData {
    container = 'bicf/gudmaprbkaligner:2.0.0'
  }

--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -118,7 +118,7 @@ process getData {
    """
 }

-// Replicate raw fastqs for multiple process inputs
+// Replicate raw fastq's for multiple process inputs
 fastqs.into {
  fastqs_downsampleData
  fastqs_trimData
@@ -195,6 +195,7 @@ metadata.splitCsv(sep: ",", header: false).separate(
 // Replicate metadata for multiple process inputs
 endsManual.into {
  endsManual_downsampleData
+  endsManual_alignSampleData
  endsManual_trimData
  endsManual_alignData
  endsManual_featureCounts
@@ -221,7 +222,7 @@ process getRefInfer {
    val referenceInfer

  output:
-    tuple val ("${referenceInfer}"), path ("hisat2", type: 'dir'), path ("bed", type: 'dir'), path ("*.fna"), path ("*.gtf")  into refInfer
+    tuple val (referenceInfer), path ("hisat2", type: 'dir'), path ("bed", type: 'dir'), path ("*.fna"), path ("*.gtf")  into refInfer
    path ("${repRID}.getRefInfer.{out,err}")
 
  script:
@@ -359,7 +360,7 @@ process trimData {
    hostname > ${repRID}.trimData.err
    ulimit -a >> ${repRID}.trimData.err

-    #Trim fastqs using trim_galore
+    #Trim fastq's using trim_galore
    if [ "${endsManual_trimData}" == "se" ]
    then
      echo "LOG: running trim_galore using single-end settings" >> ${repRID}.trimData.err
@@ -372,7 +373,7 @@ process trimData {
    """
 }

-// Replicate trimmed fastqs
+// Replicate trimmed fastq's
 fastqsTrim.into {
  fastqsTrim_downsampleData
  fastqsTrim_alignData
@@ -390,7 +391,8 @@ process downsampleData {
    path fastq from fastqsTrim_downsampleData

  output:
-    path ("sampled.{1,2}.fq") into fastqsSample
+    path ("sampled.1.fq") into fastqs1Sample
+    path ("sampled.2.fq") optional true into fastqs2Sample
    path ("${repRID}.downsampleData.{out,err}")

  script:
@@ -413,6 +415,54 @@ process downsampleData {
    """
 }

+// Build alignSampleData input: cross the ends value with each refInfer tuple and the collected downsampled fastq's
+inferInput = endsManual_alignSampleData.combine(refInfer.combine(fastqs1Sample.collect().combine(fastqs2Sample.collect())))
+
+/*
+ * alignSampleData: aligns the downsampled reads to a reference database
+*/
+process alignSampleData {
+  tag "${ref}"
+  publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.alignSampleData.{out,err}"
+
+  input:
+    tuple val (ends), val (ref), path (hisat2), path (bed), path (fna), path (gtf), path (fastq1), path (fastq2) from inferInput
+
+  output:
+    tuple val (ref), path ("sampled.sorted.bam"), path ("sampled.sorted.bam.bai"), path (bed) into sampleBam
+    path ("*.alignSampleSummary.txt") into alignSampleQC
+    path ("${repRID}.alignSampleData.{out,err}")
+
+  script:
+    """
+    hostname > ${repRID}.alignSampleData.err
+    ulimit -a >> ${repRID}.alignSampleData.err
+
+    #Align the downsampled reads with Hisat2 (mode chosen by ends); log to the published alignSampleData.err
+    if [ "${ends}" == "se" ]
+    then
+      echo "LOG: running Hisat2 with single-end settings" >> ${repRID}.alignSampleData.err
+      hisat2 -p `nproc` --add-chrname -S sampled.sam -x hisat2/genome -U ${fastq1} --summary-file ${repRID}.alignSampleSummary.txt --new-summary 1>> ${repRID}.alignSampleData.out 2>> ${repRID}.alignSampleData.err
+    elif [ "${ends}" == "pe" ]
+    then
+      echo "LOG: running Hisat2 with paired-end settings" >> ${repRID}.alignSampleData.err
+      hisat2 -p `nproc` --add-chrname -S sampled.sam -x hisat2/genome --no-mixed --no-discordant -1 ${fastq1} -2 ${fastq2} --summary-file ${repRID}.alignSampleSummary.txt --new-summary 1>> ${repRID}.alignSampleData.out 2>> ${repRID}.alignSampleData.err
+    fi
+    
+    #Convert the output sam file to a bam file using Samtools, dropping reads matching the -F flag filters
+    echo "LOG: converting from sam to bam" >> ${repRID}.alignSampleData.err
+    samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o sampled.bam sampled.sam 1>> ${repRID}.alignSampleData.out 2>> ${repRID}.alignSampleData.err;
+
+    #Sort the bam file using Samtools (produces the sampled.sorted.bam output)
+    echo "LOG: sorting the bam file" >> ${repRID}.alignSampleData.err
+    samtools sort -@ `nproc` -O BAM -o sampled.sorted.bam sampled.bam 1>> ${repRID}.alignSampleData.out 2>> ${repRID}.alignSampleData.err;
+
+    #Index the sorted bam using Samtools (produces the sampled.sorted.bam.bai output)
+    echo "LOG: indexing sorted bam file" >> ${repRID}.alignSampleData.err
+    samtools index -@ `nproc` -b sampled.sorted.bam sampled.sorted.bam.bai 1>> ${repRID}.alignSampleData.out 2>> ${repRID}.alignSampleData.err;
+    """
+}
+
 /*
 * alignData: aligns the reads to a reference database
 */