From ad1a4aa52a9c234c2efc0cbf2bdf9c8cf35606bb Mon Sep 17 00:00:00 2001 From: Venkat Malladi <venkat.malladi@utsouthwestern.edu> Date: Tue, 12 Sep 2017 15:30:30 -0500 Subject: [PATCH] First test at using fastqc. --- astrocyte_pkg.yml | 44 +++-------- workflow/main.nf | 149 +++++------------------------------ workflow/scripts/qc_fastq.py | 76 ++++++++++++++++++ 3 files changed, 107 insertions(+), 162 deletions(-) create mode 100644 workflow/scripts/qc_fastq.py diff --git a/astrocyte_pkg.yml b/astrocyte_pkg.yml index 5c2f7f5..9b9ec96 100644 --- a/astrocyte_pkg.yml +++ b/astrocyte_pkg.yml @@ -33,12 +33,13 @@ documentation_files: # NEXTFLOW WORKFLOW CONFIGURATION # ----------------------------------------------------------------------------- -# Remember - The workflow file is always named 'workflow/main.f' +# Remember - The workflow file is always named 'workflow/main.nf' # The workflow must publish all final output into $baseDir # A list of clueter environment modules that this workflow requires to run. # Specify versioned module names to ensure reproducability. workflow_modules: + - 'fastqc/0.11.5' - 'deeptools/2.3.5' - 'meme/4.11.1-gcc-openmpi' @@ -76,45 +77,21 @@ workflow_modules: workflow_parameters: - - id: bams + - id: reads type: files required: true description: | - Bam files of all samples - regex: ".*(bam|BAM)" + Fastq files of all samples + regex: "*_{1,2}.fastq.gz" - - id: peaks - type: files - required: true - description: | - Peak files of all samples. Peaks should be sorted by user using either p_value or intensity of the signals.Bed format. - regex: ".*(narrowPeak|broadPeak|bed|BED)" - - - - id: design - type: files - required: true - regex: ".*(csv)" - description: | - A design file listing pairs of sample name and sample group. Must be in csv format - Columns must include: SampleID,Tissue, Factor, Condition, Replicate, Peaks, bamReads, bamControl, ControlID, PeakCaller - - - id: genomepath + - id: singleEnd type: select - choices: - - [ '/project/shared/bicf_workflow_ref/GRCh38', 'human GRCh38'] - - [ '/project/shared/bicf_workflow_ref/GRCh37', 'human GRCh37'] - - [ '/project/shared/bicf_workflow_ref/GRCm38', 'mouse GRCm38'] required: true + choices: + - [ 'true', 'True'] + - [ 'false', 'False'] description: | - Reference genome for annotation - - - id: toppeakcount - type: integer - required: true - default: -1 - description: | - The number of top peaks to use for motif discovery. This program will nott sort peak BED files for you, so please make sure your peak files are already sorted.If want all peaks to be used, use -1. + In single-end sequencing, the sequencer reads a fragment from only one end to the other, generating the sequence of base pairs. In paired-end reading it starts at one read, finishes this direction at the specified read length, and then starts another round of reading from the opposite end of the fragment. # ----------------------------------------------------------------------------- # SHINY APP CONFIGURATION @@ -139,4 +116,3 @@ vizapp_bioc_packages: # - ballgown vizapp_github_packages: - js229/Vennerable - diff --git a/workflow/main.nf b/workflow/main.nf index bb90254..b28c8e4 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -1,139 +1,32 @@ #!/usr/bin/env nextflow - params.design="$baseDir/../test_data/samplesheet.csv" - params.bams = "$baseDir/../test_data/*.bam" - params.peaks = "$baseDir/../test_data/*.broadPeak" - params.genomepath="/project/shared/bicf_workflow_ref/GRCh37" - toppeakcount = -1 - design_file = file(params.design) - deeptools_design = Channel.fromPath(params.design) - diffbind_design = Channel.fromPath(params.design) - chipseeker_design = Channel.fromPath(params.design) - meme_design = Channel.fromPath(params.design) - index_bams = Channel.fromPath(params.bams) - deeptools_bams = Channel.fromPath(params.bams) - deeptools_peaks = Channel.fromPath(params.peaks) - chipseeker_peaks = Channel.fromPath(params.peaks) - diffbind_bams = Channel.fromPath(params.bams) - diffbind_peaks = Channel.fromPath(params.peaks) - meme_peaks = Channel.fromPath(params.peaks) -process bamindex { - publishDir "$baseDir/output/", mode: 'copy' - input: - file index_bam_files from index_bams - output: - file "*bai" into deeptools_bamindex - file "*bai" into diffbind_bamindex +// Path to an input file, or a pattern for multiple inputs +// Note - $baseDir is the location of this workflow file main.nf - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - module load samtools/intel/1.3 - samtools index $index_bam_files - """ -} +params.reads = "$baseDir/../test_data/*_{1,2}.fastq.gz" +params.singleEnd = false -process run_deeptools { - publishDir "$baseDir/output", mode: 'copy' - input: - file deeptools_design_file from deeptools_design - file deeptools_bam_files from deeptools_bams.toList() - file deeptools_peak_files from deeptools_peaks.toList() - file deeptools_bam_indexes from deeptools_bamindex.toList() - output: - file "*deeptools*" into deeptools_output - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - module load deeptools/2.3.5 - python $baseDir/scripts/runDeepTools.py -i ${params.design} -g ${params.genomepath}} -""" -} +Channel + .fromFilePairs( params.reads, size: params.singleEnd ? 1 : 2 ) + .ifEmpty { error "Cannot find any reads matching: ${params.reads}\nIf this is single-end data, please specify."" } + .set { read_pairs } -process run_diffbind { - publishDir "$baseDir/output", mode: 'copy' - input: - file diffbind_design_file from diffbind_design - file diffbind_bam_files from diffbind_bams.toList() - file diffbind_peak_files from diffbind_peaks.toList() - file diffbind_bam_indexes from diffbind_bamindex.toList() - output: - file "diffpeak.design" into diffpeaksdesign_chipseeker - file "diffpeak.design" into diffpeaksdesign_meme - file "*_diffbind.bed" into diffpeaks_meme - file "*_diffbind.bed" into diffpeaks_chipseeker - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - Rscript $baseDir/scripts/runDiffBind.R $diffbind_design_file -""" -} +process qc_fastq { + tag "$name" -process run_chipseeker_diffpeak { - publishDir "$baseDir/output", mode: 'copy' - input: - file diffpeak_design_file from diffpeaksdesign_chipseeker - file diffpeaks from diffpeaks_chipseeker - output: - file "*chipseeker*" into chipseeker_diffpeak_output - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - Rscript $baseDir/scripts/runChipseeker.R $diffpeak_design_file ${params.genomepath} -""" -} + publishDir "$baseDir/output/$name/qc_fastq", mode: 'copy' -process run_chipseeker_originalpeak { - publishDir "$baseDir/output", mode: 'copy' - input: - file design_file from chipseeker_design - file chipseeker_peak_files from chipseeker_peaks.toList() - output: - file "*chipseeker*" into chipseeker_originalpeak_output - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - Rscript $baseDir/scripts/runChipseeker.R $design_file ${params.genomepath} -""" -} -process run_meme_original { - publishDir "$baseDir/output", mode: 'copy' - input: - file design_meme from meme_design - file meme_peak_files from meme_peaks.toList() - output: - file "*meme*" into meme_original_output - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - module add deeptools/2.3.5 - module load meme/4.11.1-gcc-openmpi - python $baseDir/scripts/runMemechip.py -i $design_meme -g ${params.genomepath} -l ${toppeakcount} -""" -} + input: + set val(name), file(reads) from read_pairs -process run_meme_diffpeak { - publishDir "$baseDir/output", mode: 'copy' - input: - file peaks_meme from diffpeaks_meme - file diffpeak_design from diffpeaksdesign_meme - output: - file "*meme*" into meme_diffpeak_output - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - module add deeptools/2.3.5 - module load meme/4.11.1-gcc-openmpi - python $baseDir/scripts/runMemechip.py -i $diffpeak_design -g ${params.genomepath} -l ${toppeakcount} -""" -} + output: + file "*_fastqc.{zip,html}" into fastqc_results + script: + """ + module load fastqc/0.11.5 + $baseDir/scripts/qc_fastq.py -f $reads + """ + } diff --git a/workflow/scripts/qc_fastq.py b/workflow/scripts/qc_fastq.py new file mode 100644 index 0000000..2e23851 --- /dev/null +++ b/workflow/scripts/qc_fastq.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +# -*- coding: latin-1 -*- +'''QC check of raw .fastq files using FASTQC.''' + +import os +import subprocess +import argparse +import shlex +import shutil +from multiprocessing import cpu_count +import logging +import sys +import json + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +## SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +def check_tools(): + '''Checks for required componenets on user system''' + + logger.info('Checking for required libraries and components on this system') + + fastqc_path = shutil.which("fastqc") + if fastqc_path: + logger.info('Found fastqc:%s' % (fastqc_path)) + else: + print("Please install 'fastqc' before using the tool") + sys.exit() + + +def get_args(): + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-f', '--fastq', + help="The fastq file to run QC check on.", + nargs='+', + required=True) + + args = parser.parse_args() + return args + + +def check_qual_fastq(fastq): + '''Run fastqc on 1 or 2 files.''' + qc_command = "fastqc -t -f fastq " + " ".join(fastq) + + logger.info("Running fastqc with %s" % (qc_command)) + + p = subprocess.Popen(qc_command, shell=True) + p.communicate() + + +def main(): + args = get_args() + + # create a file handler + handler = logging.FileHandler('qc.log') + logger.addHandler(handler) + + check_qual_fastq(args.fastq) + + +if __name__ == '__main__': + main() -- GitLab