Merge branch '14-FixTrimReads' into 'master'

Resolve "Fix Trim Reads" Closes #14 See merge request !7

Merge branch '14-FixTrimReads' into 'master'
Resolve "Fix Trim Reads" Closes #14 See merge request !7
e6657f34 · Venkat Malladi · 9c3eebed · 4a6eb2b8 · e6657f34 · e6657f34
Commit e6657f34 authored 5 years ago by Venkat Malladi
--- a/README.md
+++ b/README.md
@@ -28,6 +28,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
 ##### 1) Fastq Files
  + You will need the full path to the files for the Bash Scipt

+
 ## Design file
  + The Design file is a tab-delimited file with 4 columns for Single-End and 5 columns for Paired-End.  Letter, numbers, and underlines can be used in the names. However, the names must begin with a letter. Columns must be as follows:

@@ -41,34 +42,28 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
  + See [HERE](/docs/design_ENCSR265ZXX_SE.txt) for an example design file, single-end


-## SOP
-This SOP describes the analysis pipeline of downstream analysis of ChIP-seq sequencing data. This pipeline includes (1) Quality control using Deeptools, (2) Peak annotation, (3) Differential peak analysis, and (4) motif analysis. BAM files and SORTED peak BED files selected as input. For each sample this workflow:
-
-    1) Annotate all peaks using ChipSeeker
-    2) Qulity control and signal profiling with Deeptools
-    3) Find differential expressed peaks using DiffBind
-    4) Annotate all differentially expressed peaks
-    5) Using MEME-ChIP in motif finding for both original peaks and differently expressed peaks
-
-## Annotations used in the pipeline
+## Pipeline
+  + There are ??? steps to the pipeline
+    1. Check input files
+    2. Trim adaptors with TrimGalore!

-    ChipSeeker - Known gene from Bioconductor [TxDb annotation](https://bioconductor.org/packages/release/BiocViews.html#___TxDb)
-    Deeptools - RefGene downloaded from UCSC Table browser

-## Workflow Parameters
+## Output Files
+Folder | File | Description
+--- | --- | ---
+design | N/A | Inputs used for analysis; can ignore
+trimReads | *_trimming_report.txt | report detailing how many reads were trimmed
+trimReads | *_trimmed.fq.gz | trimmed fastq files used for analysis

-    bam - Choose all ChIP-seq alignment files for analysis.
-    genome - Choose a genomic reference (genome).
-    peaks - Choose all the peak files for analysis. All peaks should be sorted by the user
-    design - Choose the file with the experiment design information. CSV format
-    toppeak - The number of top peaks used for motif analysis. Default is all

 ## Common Errors
 If you find an error, please let the [BICF](mailto:BICF@UTSouthwestern.edu) know and we will add it here.

+
 ## Citation
 Please cite individual programs and versions used [HERE](docs/references.md), and the pipeline doi: coming soon. Please cite in publications: Pipeline was developed by BICF from funding provided by Cancer Prevention and Research Institute of Texas (RP150596).

+
 ### Credits
 This example worklow is derived from original scripts kindly contributed by the Bioinformatic Core Facility ([BICF](https://www.utsouthwestern.edu/labs/bioinformatics/)), in the [Department of Bioinformatics](https://www.utsouthwestern.edu/departments/bioinformatics/).

--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -89,40 +89,57 @@ process checkDesignFile {
 if (pairedEnd) {
  rawReads = designFilePaths
    .splitCsv(sep: '\t', header: true)
-    .map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read2], row.experiment_id, row.replicate ] }
-} else {
+    .map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read2], row.experiment_id, row.replicate, row.fq_length ] }
+} 
+else {
 rawReads = designFilePaths
  .splitCsv(sep: '\t', header: true)
-  .map { row -> [ row.sample_id, [row.fastq_read1], row.experiment_id, row.replicate ] }
+  .map { row -> [ row.sample_id, [row.fastq_read1], row.experiment_id, row.replicate, row.fq_length ] }
 }

 // Trim raw reads using trimgalore
 process trimReads {

-  tag "$sampleId-$replicate"
-  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  tag "${sampleId}-${replicate}"
+  publishDir "${outDir}/${task.process}/${sampleId}", mode: 'copy'

  input:
-
-  set sampleId, reads, experimentId, replicate from rawReads
+    set sampleId, reads, experimentId, replicate, fqLength from rawReads

  output:
-
-  set sampleId, file('*.fq.gz'), experimentId, replicate into trimmedReads
-  file('*trimming_report.txt') into trimgalore_results
+    set sampleId, file('*.fq.gz'), experimentId, replicate, fqLength into trimmedReads
+    file('*trimming_report.txt') into trimgaloreResults
+    file('version_*.txt') into trimReadsVersions

  script:
-
-  if (pairedEnd) {
-    """
-    python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -p
-    """
-  }
-  else {
-    """
-    python3 $baseDir/scripts/trim_reads.py -f ${reads[0]}
-    """
-  }
+    if (params.astrocyte == true) {
+      if (pairedEnd) {
+        """
+        module load python/3.6.1-2-anaconda
+        module load trimgalore/0.4.1
+        python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -s ${sampleId} -p
+        """
+      }
+      else {
+        """
+        module load python/3.6.1-2-anaconda
+        module load trimgalore/0.4.1
+        python3 ${baseDir}/scripts/trim_reads.py -f ${reads[0]} -s ${sampleId}
+        """
+      }
+    }
+    else {
+      if (pairedEnd) {
+        """
+        python3 ${baseDir}/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -s ${sampleId} -p
+        """
+      }
+      else {
+        """
+        python3 ${baseDir}/scripts/trim_reads.py -f ${reads[0]} -s ${sampleId}
+        """
+      }
+    }

 }


--- a/workflow/scripts/trim_reads.py
+++ b/workflow/scripts/trim_reads.py
@@ -5,6 +5,7 @@
 import subprocess
 import argparse
 import shutil
+import os
 import logging

 EPILOG = '''
@@ -32,6 +33,10 @@ def get_args():
                        nargs='+',
                        required=True)

+    parser.add_argument('-s', '--sample',
+                        help="The name of the sample.",
+                        required=True)
+
    parser.add_argument('-p', '--paired',
                        help="True/False if paired-end or single end.",
                        default=False,
@@ -49,6 +54,15 @@ def check_tools():
    trimgalore_path = shutil.which("trim_galore")
    if trimgalore_path:
        logger.info('Found trimgalore: %s', trimgalore_path)
+
+        # Get Version
+        trim_version_command = "trim_galore --version"
+        trimgalore_version = subprocess.check_output(trim_version_command, shell=True)
+
+        # Write to file
+        trimgalore_file = open("version_trimgalore.txt", "wb")
+        trimgalore_file.write(trimgalore_version)
+        trimgalore_file.close()
    else:
        logger.error('Missing trimgalore')
        raise Exception('Missing trimgalore')
@@ -56,11 +70,46 @@ def check_tools():
    cutadapt_path = shutil.which("cutadapt")
    if cutadapt_path:
        logger.info('Found cutadapt: %s', cutadapt_path)
+
+        # Get Version
+        cutadapt_version_command = "cutadapt --version"
+        cutadapt_version = subprocess.check_output(cutadapt_version_command, shell=True)
+
+        # Write to file
+        cutadapt_file = open("version_cutadapt.txt", "wb")
+        cutadapt_file.write(b"Version %s" % (cutadapt_version))
+        cutadapt_file.close()
    else:
        logger.error('Missing cutadapt')
        raise Exception('Missing cutadapt')


+def rename_reads(fastq, sample, paired):
+    '''Rename fastq files by sample name.'''
+
+    # Get current directory to build paths
+    cwd = os.getcwd()
+
+    renamed_fastq = []
+
+    if paired:  # paired-end data
+        # Set file names
+        renamed_fastq.append(cwd + '/' + sample + '_R1.fastq.gz')
+        renamed_fastq.append(cwd + '/' + sample + '_R2.fastq.gz')
+
+        # Great symbolic links
+        os.symlink(fastq[0], renamed_fastq[0])
+        os.symlink(fastq[1], renamed_fastq[1])
+    else:
+        # Set file names
+        renamed_fastq.append(cwd + '/' + sample + '_R1.fastq.gz')
+
+        # Great symbolic links
+        os.symlink(fastq[0], renamed_fastq[0])
+
+    return renamed_fastq
+
+
 def trim_reads(fastq, paired):
    '''Run trim_galore on 1 or 2 files.'''

@@ -82,6 +131,7 @@ def trim_reads(fastq, paired):
 def main():
    args = get_args()
    fastq = args.fastq
+    sample = args.sample
    paired = args.paired

    # Create a file handler
@@ -91,8 +141,11 @@ def main():
    # Check if tools are present
    check_tools()

+    # Rename fastq files by sample
+    fastq_rename = rename_reads(fastq, sample, paired)
+
    # Run trim_reads
-    trim_reads(fastq, paired)
+    trim_reads(fastq_rename, paired)


 if __name__ == '__main__':

--- a/workflow/tests/test_trim_reads.py
+++ b/workflow/tests/test_trim_reads.py
@@ -10,19 +10,39 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \



-@pytest.mark.integration
-def test_trim_reads_singleend():
-    #raw_fastq = test_data_path + 'ENCFF833BLU.fastq.gz'
-    #trimmed_fastq = test_output_path + 'ENCFF833BLU_trimmed.fq.gz'
-    #trimmed_fastq_report = test_output_path + \
-    #                        'ENCFF833BLU.fastq.gz_trimming_report.txt'
-    #assert os.path.getsize(raw_fastq) != os.path.getsize(trimmed_fastq)
-    #assert os.path.getsize(trimmed_fastq) == 2512853101
-    #assert 'Trimming mode: single-end' in open(trimmed_fastq_report).readlines()[4]
-    pass
-
-
-@pytest.mark.integration
-def test_trim_reads_pairedend():
-    # Do the same thing for paired end data
-    pass
+@pytest.mark.pairedend_mouse
+def test_trim_reads_pairedend_mouse():
+    raw_fastq1 = test_data_path + 'ENCFF913PMS.fastq.gz'
+    trimmed_fastq1 = test_output_path + 'ENCLB122XDP/ENCLB122XDP_R1_val_1.fq.gz'
+
+    raw_fastq2 = test_data_path + 'ENCFF999SZR.fastq.gz'
+    trimmed_fastq2 = test_output_path + 'ENCLB749GLW/ENCLB749GLW_R2_val_2.fq.gz'
+
+    assert os.path.getsize(raw_fastq1) != os.path.getsize(trimmed_fastq1)
+    assert os.path.getsize(trimmed_fastq1) == 934738906
+
+    assert os.path.getsize(raw_fastq2) != os.path.getsize(trimmed_fastq2)
+    assert os.path.getsize(trimmed_fastq2) == 1297020342
+
+    trimmed_fastq_report1 = test_output_path + \
+                             'ENCLB122XDP/ENCLB122XDP_R1.fastq.gz_trimming_report.txt'
+    assert 'Trimming mode: paired-end' in open(trimmed_fastq_report1).readlines()[4]
+
+    trimmed_fastq_report2 = test_output_path + \
+                             'ENCLB749GLW/ENCLB749GLW_R1.fastq.gz_trimming_report.txt'
+    assert 'Trimming mode: paired-end' in open(trimmed_fastq_report2).readlines()[4]
+
+
+@pytest.mark.singleend_human
+def test_trim_reads_singleend_human():
+    raw_fastq = test_data_path + 'ENCFF115PAE.fastq.gz'
+    trimmed_fastq = test_output_path + 'ENCLB170KFO/ENCLB170KFO_R1_trimmed.fq.gz'
+
+    assert os.path.getsize(raw_fastq) != os.path.getsize(trimmed_fastq)
+    assert os.path.getsize(trimmed_fastq) == 2171147098
+
+    trimmed_fastq_report = test_output_path + \
+                            'ENCLB969KTX/ENCLB969KTX_R1.fastq.gz_trimming_report.txt'
+    assert 'Trimming mode: single-end' in open(trimmed_fastq_report).readlines()[4]
+
+