First test at using fastqc.

ad1a4aa5 · Venkat Malladi · c81ad56c · ad1a4aa5 · ad1a4aa5 · ad1a4aa5
Commit ad1a4aa5 authored 7 years ago by Venkat Malladi
--- a/astrocyte_pkg.yml
+++ b/astrocyte_pkg.yml
@@ -33,12 +33,13 @@ documentation_files:
 # NEXTFLOW WORKFLOW CONFIGURATION
 # -----------------------------------------------------------------------------

-# Remember - The workflow file is always named 'workflow/main.f'
+# Remember - The workflow file is always named 'workflow/main.nf'
 #            The workflow must publish all final output into $baseDir

 # A list of clueter environment modules that this workflow requires to run.
 # Specify versioned module names to ensure reproducability.
 workflow_modules:
+  - 'fastqc/0.11.5'
  - 'deeptools/2.3.5'
  - 'meme/4.11.1-gcc-openmpi'

@@ -76,45 +77,21 @@ workflow_modules:

 workflow_parameters:

-  - id: bams
+  - id: reads
    type: files
    required: true
    description: |
-      Bam files of all samples
-    regex: ".*(bam|BAM)"
+      Fastq files of all  samples
+    regex: "*_{1,2}.fastq.gz"

-  - id: peaks
-    type: files
-    required: true
-    description: |
-      Peak files of all samples. Peaks should be sorted by user using either p_value or intensity of the signals.Bed format.
-    regex: ".*(narrowPeak|broadPeak|bed|BED)"
-
-
-  - id: design
-    type: files
-    required: true
-    regex: ".*(csv)"
-    description: |
-      A design file listing pairs of sample name and sample group. Must be in csv format
-      Columns must include: SampleID,Tissue, Factor, Condition, Replicate, Peaks, bamReads, bamControl, ControlID, PeakCaller
-
-  - id: genomepath
+  - id: singleEnd
    type: select
-    choices:
-      - [ '/project/shared/bicf_workflow_ref/GRCh38', 'human GRCh38']
-      - [ '/project/shared/bicf_workflow_ref/GRCh37', 'human GRCh37']
-      - [ '/project/shared/bicf_workflow_ref/GRCm38', 'mouse GRCm38']
    required: true
+    choices:
+      - [ 'true', 'True']
+      - [ 'false', 'False']
    description: |
-      Reference genome for annotation
-
-  - id: toppeakcount
-    type: integer
-    required: true
-    default: -1
-    description: |
-      The number of top peaks to use for motif discovery. This program will nott sort peak BED files for you, so please make sure your peak files are already sorted.If want all peaks to be used, use -1.
+      In single-end sequencing, the sequencer reads a fragment from only one end to the other, generating the sequence of base pairs. In paired-end reading it starts at one read, finishes this direction at the specified read length, and then starts another round of reading from the opposite end of the fragment.

 # -----------------------------------------------------------------------------
 # SHINY APP CONFIGURATION
@@ -139,4 +116,3 @@ vizapp_bioc_packages:
 #  - ballgown
 vizapp_github_packages:
  - js229/Vennerable
-
--- a/workflow/main.nf
+++ b/workflow/main.nf
 #!/usr/bin/env nextflow
-   params.design="$baseDir/../test_data/samplesheet.csv"
-   params.bams = "$baseDir/../test_data/*.bam"
-   params.peaks = "$baseDir/../test_data/*.broadPeak"
-   params.genomepath="/project/shared/bicf_workflow_ref/GRCh37"
-   toppeakcount = -1
-   design_file = file(params.design)
-   deeptools_design = Channel.fromPath(params.design)
-   diffbind_design = Channel.fromPath(params.design)
-   chipseeker_design = Channel.fromPath(params.design)
-   meme_design = Channel.fromPath(params.design)
-   index_bams = Channel.fromPath(params.bams)
-   deeptools_bams = Channel.fromPath(params.bams) 
-   deeptools_peaks = Channel.fromPath(params.peaks) 
-   chipseeker_peaks = Channel.fromPath(params.peaks) 
-   diffbind_bams = Channel.fromPath(params.bams) 
-   diffbind_peaks = Channel.fromPath(params.peaks) 
-   meme_peaks = Channel.fromPath(params.peaks)

-process bamindex {
-   publishDir "$baseDir/output/", mode: 'copy'
-   input:
-     file index_bam_files from index_bams
-   output:
-     file "*bai" into deeptools_bamindex
-     file "*bai" into diffbind_bamindex
+// Path to an input file, or a pattern for multiple inputs
+// Note - $baseDir is the location of this workflow file main.nf

-   script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     module load samtools/intel/1.3
-     samtools index $index_bam_files
-     """
-}
+params.reads = "$baseDir/../test_data/*_{1,2}.fastq.gz"
+params.singleEnd = false

-process run_deeptools {
-   publishDir "$baseDir/output", mode: 'copy'
-   input:
-     file deeptools_design_file from deeptools_design
-     file deeptools_bam_files from deeptools_bams.toList()
-     file deeptools_peak_files from deeptools_peaks.toList()
-     file deeptools_bam_indexes from deeptools_bamindex.toList()
-   output:
-     file "*deeptools*" into deeptools_output
-     script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     module load deeptools/2.3.5
-     python $baseDir/scripts/runDeepTools.py -i ${params.design} -g ${params.genomepath}}
-"""
-}

+Channel
+    .fromFilePairs( params.reads, size: params.singleEnd ? 1 : 2 )
+    .ifEmpty { error "Cannot find any reads matching: ${params.reads}\nIf this is single-end data, please specify."" }
+    .set { read_pairs }

-process run_diffbind {
-   publishDir "$baseDir/output", mode: 'copy'
-   input:
-     file diffbind_design_file from diffbind_design
-     file diffbind_bam_files from diffbind_bams.toList()
-     file diffbind_peak_files from diffbind_peaks.toList()
-     file diffbind_bam_indexes from diffbind_bamindex.toList()
-   output:
-     file "diffpeak.design" into diffpeaksdesign_chipseeker
-     file "diffpeak.design" into diffpeaksdesign_meme
-     file "*_diffbind.bed" into diffpeaks_meme
-     file "*_diffbind.bed" into diffpeaks_chipseeker
-   script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     Rscript $baseDir/scripts/runDiffBind.R $diffbind_design_file
-"""
-}
+process qc_fastq {
+    tag "$name"

-process run_chipseeker_diffpeak {
-   publishDir "$baseDir/output", mode: 'copy'
-   input:
-     file diffpeak_design_file from diffpeaksdesign_chipseeker
-     file diffpeaks from diffpeaks_chipseeker
-   output:
-     file "*chipseeker*" into chipseeker_diffpeak_output
-   script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     Rscript $baseDir/scripts/runChipseeker.R $diffpeak_design_file ${params.genomepath}
-"""
-}
+    publishDir "$baseDir/output/$name/qc_fastq", mode: 'copy'

-process run_chipseeker_originalpeak {
-   publishDir "$baseDir/output", mode: 'copy'
-   input:
-     file design_file from chipseeker_design
-     file chipseeker_peak_files from chipseeker_peaks.toList()
-   output:
-     file "*chipseeker*" into chipseeker_originalpeak_output
-   script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     Rscript $baseDir/scripts/runChipseeker.R $design_file ${params.genomepath}
-"""
-}

-process run_meme_original {
-   publishDir "$baseDir/output", mode: 'copy'
-   input:
-     file design_meme from meme_design
-     file meme_peak_files from meme_peaks.toList()
-   output:
-     file "*meme*" into meme_original_output
-   script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     module add deeptools/2.3.5
-     module load meme/4.11.1-gcc-openmpi
-     python $baseDir/scripts/runMemechip.py -i $design_meme -g ${params.genomepath} -l ${toppeakcount}
-"""
-}
+    input:
+    set val(name), file(reads) from read_pairs

-process run_meme_diffpeak {
-   publishDir "$baseDir/output", mode: 'copy'
-   input:
-     file peaks_meme from diffpeaks_meme
-     file diffpeak_design from diffpeaksdesign_meme
-   output:
-     file "*meme*" into meme_diffpeak_output
-   script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     module add deeptools/2.3.5
-     module load meme/4.11.1-gcc-openmpi
-     python $baseDir/scripts/runMemechip.py -i $diffpeak_design -g ${params.genomepath} -l ${toppeakcount}
-"""
-}
+    output:
+    file "*_fastqc.{zip,html}" into fastqc_results

+    script:
+    """
+    module load fastqc/0.11.5
+    $baseDir/scripts/qc_fastq.py -f $reads
+    """
+    }
--- a/workflow/scripts/qc_fastq.py
+++ b/workflow/scripts/qc_fastq.py
+#!/usr/bin/env python3
+# -*- coding: latin-1 -*-
+'''QC check of raw .fastq files using FASTQC.'''
+
+import os
+import subprocess
+import argparse
+import shlex
+import shutil
+from multiprocessing import cpu_count
+import logging
+import sys
+import json
+
+EPILOG = '''
+For more details:
+        %(prog)s --help
+'''
+
+## SETTINGS
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+
+
+def check_tools():
+    '''Checks for required componenets on user system'''
+
+    logger.info('Checking for required libraries and components on this system')
+
+    fastqc_path = shutil.which("fastqc")
+    if fastqc_path:
+        logger.info('Found fastqc:%s' % (fastqc_path))
+    else:
+        print("Please install 'fastqc' before using the tool")
+        sys.exit()
+
+
+def get_args():
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument('-f', '--fastq',
+        help="The fastq file to run QC check on.",
+        nargs='+',
+        required=True)
+
+    args = parser.parse_args()
+    return args
+
+
+def check_qual_fastq(fastq):
+    '''Run fastqc on 1 or 2 files.'''
+    qc_command = "fastqc -t -f fastq " + " ".join(fastq)
+
+    logger.info("Running fastqc with %s" % (qc_command))
+
+    p = subprocess.Popen(qc_command, shell=True)
+    p.communicate()
+
+
+def main():
+    args = get_args()
+
+    # create a file handler
+    handler = logging.FileHandler('qc.log')
+    logger.addHandler(handler)
+
+    check_qual_fastq(args.fastq)
+
+
+if __name__ == '__main__':
+    main()