Skip to content
Snippets Groups Projects
Commit ad1a4aa5 authored by Venkat Malladi's avatar Venkat Malladi
Browse files

First test at using fastqc.

parent c81ad56c
Branches
Tags
1 merge request!4Resolve "Add in Fastqc"
......@@ -33,12 +33,13 @@ documentation_files:
# NEXTFLOW WORKFLOW CONFIGURATION
# -----------------------------------------------------------------------------
# Remember - The workflow file is always named 'workflow/main.f'
# Remember - The workflow file is always named 'workflow/main.nf'
# The workflow must publish all final output into $baseDir
# A list of clueter environment modules that this workflow requires to run.
# Specify versioned module names to ensure reproducability.
workflow_modules:
- 'fastqc/0.11.5'
- 'deeptools/2.3.5'
- 'meme/4.11.1-gcc-openmpi'
......@@ -76,45 +77,21 @@ workflow_modules:
workflow_parameters:
- id: bams
- id: reads
type: files
required: true
description: |
Bam files of all samples
regex: ".*(bam|BAM)"
Fastq files of all samples
regex: "*_{1,2}.fastq.gz"
- id: peaks
type: files
required: true
description: |
Peak files of all samples. Peaks should be sorted by user using either p_value or intensity of the signals.Bed format.
regex: ".*(narrowPeak|broadPeak|bed|BED)"
- id: design
type: files
required: true
regex: ".*(csv)"
description: |
A design file listing pairs of sample name and sample group. Must be in csv format
Columns must include: SampleID,Tissue, Factor, Condition, Replicate, Peaks, bamReads, bamControl, ControlID, PeakCaller
- id: genomepath
- id: singleEnd
type: select
choices:
- [ '/project/shared/bicf_workflow_ref/GRCh38', 'human GRCh38']
- [ '/project/shared/bicf_workflow_ref/GRCh37', 'human GRCh37']
- [ '/project/shared/bicf_workflow_ref/GRCm38', 'mouse GRCm38']
required: true
choices:
- [ 'true', 'True']
- [ 'false', 'False']
description: |
Reference genome for annotation
- id: toppeakcount
type: integer
required: true
default: -1
description: |
The number of top peaks to use for motif discovery. This program will nott sort peak BED files for you, so please make sure your peak files are already sorted.If want all peaks to be used, use -1.
In single-end sequencing, the sequencer reads a fragment from only one end to the other, generating the sequence of base pairs. In paired-end reading it starts at one read, finishes this direction at the specified read length, and then starts another round of reading from the opposite end of the fragment.
# -----------------------------------------------------------------------------
# SHINY APP CONFIGURATION
......@@ -139,4 +116,3 @@ vizapp_bioc_packages:
# - ballgown
vizapp_github_packages:
- js229/Vennerable
#!/usr/bin/env nextflow
params.design="$baseDir/../test_data/samplesheet.csv"
params.bams = "$baseDir/../test_data/*.bam"
params.peaks = "$baseDir/../test_data/*.broadPeak"
params.genomepath="/project/shared/bicf_workflow_ref/GRCh37"
toppeakcount = -1
design_file = file(params.design)
deeptools_design = Channel.fromPath(params.design)
diffbind_design = Channel.fromPath(params.design)
chipseeker_design = Channel.fromPath(params.design)
meme_design = Channel.fromPath(params.design)
index_bams = Channel.fromPath(params.bams)
deeptools_bams = Channel.fromPath(params.bams)
deeptools_peaks = Channel.fromPath(params.peaks)
chipseeker_peaks = Channel.fromPath(params.peaks)
diffbind_bams = Channel.fromPath(params.bams)
diffbind_peaks = Channel.fromPath(params.peaks)
meme_peaks = Channel.fromPath(params.peaks)
process bamindex {
publishDir "$baseDir/output/", mode: 'copy'
input:
file index_bam_files from index_bams
output:
file "*bai" into deeptools_bamindex
file "*bai" into diffbind_bamindex
// Path to an input file, or a pattern for multiple inputs
// Note - $baseDir is the location of this workflow file main.nf
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
module load samtools/intel/1.3
samtools index $index_bam_files
"""
}
params.reads = "$baseDir/../test_data/*_{1,2}.fastq.gz"
params.singleEnd = false
process run_deeptools {
publishDir "$baseDir/output", mode: 'copy'
input:
file deeptools_design_file from deeptools_design
file deeptools_bam_files from deeptools_bams.toList()
file deeptools_peak_files from deeptools_peaks.toList()
file deeptools_bam_indexes from deeptools_bamindex.toList()
output:
file "*deeptools*" into deeptools_output
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
module load deeptools/2.3.5
python $baseDir/scripts/runDeepTools.py -i ${params.design} -g ${params.genomepath}}
"""
}
Channel
.fromFilePairs( params.reads, size: params.singleEnd ? 1 : 2 )
.ifEmpty { error "Cannot find any reads matching: ${params.reads}\nIf this is single-end data, please specify."" }
.set { read_pairs }
process run_diffbind {
publishDir "$baseDir/output", mode: 'copy'
input:
file diffbind_design_file from diffbind_design
file diffbind_bam_files from diffbind_bams.toList()
file diffbind_peak_files from diffbind_peaks.toList()
file diffbind_bam_indexes from diffbind_bamindex.toList()
output:
file "diffpeak.design" into diffpeaksdesign_chipseeker
file "diffpeak.design" into diffpeaksdesign_meme
file "*_diffbind.bed" into diffpeaks_meme
file "*_diffbind.bed" into diffpeaks_chipseeker
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
Rscript $baseDir/scripts/runDiffBind.R $diffbind_design_file
"""
}
process qc_fastq {
tag "$name"
process run_chipseeker_diffpeak {
publishDir "$baseDir/output", mode: 'copy'
input:
file diffpeak_design_file from diffpeaksdesign_chipseeker
file diffpeaks from diffpeaks_chipseeker
output:
file "*chipseeker*" into chipseeker_diffpeak_output
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
Rscript $baseDir/scripts/runChipseeker.R $diffpeak_design_file ${params.genomepath}
"""
}
publishDir "$baseDir/output/$name/qc_fastq", mode: 'copy'
process run_chipseeker_originalpeak {
publishDir "$baseDir/output", mode: 'copy'
input:
file design_file from chipseeker_design
file chipseeker_peak_files from chipseeker_peaks.toList()
output:
file "*chipseeker*" into chipseeker_originalpeak_output
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
Rscript $baseDir/scripts/runChipseeker.R $design_file ${params.genomepath}
"""
}
process run_meme_original {
publishDir "$baseDir/output", mode: 'copy'
input:
file design_meme from meme_design
file meme_peak_files from meme_peaks.toList()
output:
file "*meme*" into meme_original_output
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
module add deeptools/2.3.5
module load meme/4.11.1-gcc-openmpi
python $baseDir/scripts/runMemechip.py -i $design_meme -g ${params.genomepath} -l ${toppeakcount}
"""
}
input:
set val(name), file(reads) from read_pairs
process run_meme_diffpeak {
publishDir "$baseDir/output", mode: 'copy'
input:
file peaks_meme from diffpeaks_meme
file diffpeak_design from diffpeaksdesign_meme
output:
file "*meme*" into meme_diffpeak_output
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
module add deeptools/2.3.5
module load meme/4.11.1-gcc-openmpi
python $baseDir/scripts/runMemechip.py -i $diffpeak_design -g ${params.genomepath} -l ${toppeakcount}
"""
}
output:
file "*_fastqc.{zip,html}" into fastqc_results
script:
"""
module load fastqc/0.11.5
$baseDir/scripts/qc_fastq.py -f $reads
"""
}
#!/usr/bin/env python3
# -*- coding: latin-1 -*-
'''QC check of raw .fastq files using FASTQC.'''
import os
import subprocess
import argparse
import shlex
import shutil
from multiprocessing import cpu_count
import logging
import sys
import json
EPILOG = '''
For more details:
%(prog)s --help
'''
## SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def check_tools():
'''Checks for required componenets on user system'''
logger.info('Checking for required libraries and components on this system')
fastqc_path = shutil.which("fastqc")
if fastqc_path:
logger.info('Found fastqc:%s' % (fastqc_path))
else:
print("Please install 'fastqc' before using the tool")
sys.exit()
def get_args():
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-f', '--fastq',
help="The fastq file to run QC check on.",
nargs='+',
required=True)
args = parser.parse_args()
return args
def check_qual_fastq(fastq):
'''Run fastqc on 1 or 2 files.'''
qc_command = "fastqc -t -f fastq " + " ".join(fastq)
logger.info("Running fastqc with %s" % (qc_command))
p = subprocess.Popen(qc_command, shell=True)
p.communicate()
def main():
args = get_args()
# create a file handler
handler = logging.FileHandler('qc.log')
logger.addHandler(handler)
check_qual_fastq(args.fastq)
if __name__ == '__main__':
main()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment