Skip to content
Snippets Groups Projects
Commit c9f0a2c4 authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Merge branch '1-fastqc' into 'master'

Resolve "Add in Fastqc"

Closes #1

See merge request !4
parents 2728fc7e cb70f489
Branches
Tags
1 merge request!4Resolve "Add in Fastqc"
Pipeline #1015 failed with stage
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
before_script:
- module add python/3.6.1-2-anaconda
- pip install --user pytest-pythonpath
test:
script:
- module add python/3.6.1-2-anaconda
- pytest
......@@ -17,7 +17,8 @@ title: 'BICF ChIP-seq Analysis Workflow'
# A summary of the workflow package in plain text
description: |
This is a workflow package for the BioHPC/BICF ChIP-seq workflow system.
It implements a simple ChIP-seq analysis workflow using deepTools, Diffbind, ChipSeeker and MEME-ChIP, visualization application.
It implements a simple ChIP-seq analysis workflow and
visualization application.
# -----------------------------------------------------------------------------
# DOCUMENTATION
......@@ -33,12 +34,13 @@ documentation_files:
# NEXTFLOW WORKFLOW CONFIGURATION
# -----------------------------------------------------------------------------
# Remember - The workflow file is always named 'workflow/main.f'
# Remember - The workflow file is always named 'workflow/main.nf'
# The workflow must publish all final output into $baseDir
# A list of clueter environment modules that this workflow requires to run.
# A list of cluster environment modules that this workflow requires to run.
# Specify versioned module names to ensure reproducability.
workflow_modules:
- 'fastqc/0.11.5'
- 'deeptools/2.3.5'
- 'meme/4.11.1-gcc-openmpi'
......@@ -76,45 +78,25 @@ workflow_modules:
workflow_parameters:
- id: bams
- id: reads
type: files
required: true
description: |
Bam files of all samples
regex: ".*(bam|BAM)"
Fastq files of all samples
regex: "*_{1,2}.fastq.gz"
- id: peaks
type: files
required: true
description: |
Peak files of all samples. Peaks should be sorted by user using either p_value or intensity of the signals.Bed format.
regex: ".*(narrowPeak|broadPeak|bed|BED)"
- id: design
type: files
required: true
regex: ".*(csv)"
description: |
A design file listing pairs of sample name and sample group. Must be in csv format
Columns must include: SampleID,Tissue, Factor, Condition, Replicate, Peaks, bamReads, bamControl, ControlID, PeakCaller
- id: genomepath
- id: singleEnd
type: select
choices:
- [ '/project/shared/bicf_workflow_ref/GRCh38', 'human GRCh38']
- [ '/project/shared/bicf_workflow_ref/GRCh37', 'human GRCh37']
- [ '/project/shared/bicf_workflow_ref/GRCm38', 'mouse GRCm38']
required: true
description: |
Reference genome for annotation
- id: toppeakcount
type: integer
required: true
default: -1
choices:
- [ 'true', 'True']
- [ 'false', 'False']
description: |
The number of top peaks to use for motif discovery. This program will nott sort peak BED files for you, so please make sure your peak files are already sorted.If want all peaks to be used, use -1.
In single-end sequencing, the sequencer reads a fragment from only one
end to the other, generating the sequence of base pairs. In paired-end
reading it starts at one read, finishes this direction at the specified
read length, and then starts another round of reading from the opposite
end of the fragment.
# -----------------------------------------------------------------------------
# SHINY APP CONFIGURATION
......@@ -132,11 +114,10 @@ vizapp_cran_packages:
- shiny
- shinyFiles
# # List of any Bioconductor packages, not provided by the modules, that must be made
# available to the vizapp
# List of any Bioconductor packages, not provided by the modules,
# that must be made available to the vizapp
vizapp_bioc_packages:
- qusage
# - ballgown
vizapp_github_packages:
- js229/Vennerable
this is a read me
[pytest]
python_paths = workflow/scripts
sample_id biosample factor treatment replicate control_id fastq_read1
ENCSR238SGC limb H3K4me1 None 1 ENCSR687ALB ENCFF833BLU.fastq.gz
ENCSR238SGC limb H3K4me1 None 2 ENCSR687ALB ENCFF646LXU.fastq.gz
ENCSR687ALB limb Control None 1 ENCSR687ALB ENCFF524CAC.fastq.gz
ENCSR687ALB limb Control None 2 ENCSR687ALB ENCFF163AJI.fastq.gz
sample_id biosample factor treatment replicate control_id fastq_read1 fastq_read2
ENCSR729LGA MCF-7 SP1 None 1 ENCSR217LRF ENCFF957SQS.fastq.gz ENCFF582IOZ.fastq.gz
ENCSR729LGA MCF-7 SP1 None 2 ENCSR217LRF ENCFF330MCZ.fastq.gz ENCFF293YFE.fastq.gz
ENCSR217LRF MCF-7 Control None 1 ENCSR217LRF ENCFF002DTU.fastq.gz ENCFF002EFI.fastq.gz
ENCSR217LRF MCF-7 Control None 1 ENCSR217LRF ENCFF002EFG.fastq.gz ENCFF002DTS.fastq.gz
echo "Downloading Single-end data set Mouse ENCSR238SGC and ENCSR687ALB"
wget https://www.encodeproject.org/files/ENCFF833BLU/@@download/ENCFF833BLU.fastq.gz
wget https://www.encodeproject.org/files/ENCFF646LXU/@@download/ENCFF646LXU.fastq.gz
wget https://www.encodeproject.org/files/ENCFF524CAC/@@download/ENCFF524CAC.fastq.gz
wget https://www.encodeproject.org/files/ENCFF163AJI/@@download/ENCFF163AJI.fastq.gz
echo "Done with Single-end"
echo "Downloading Paired-end data set Human ENCSR729LGA and ENCSR217LRF"
wget https://www.encodeproject.org/files/ENCFF957SQS/@@download/ENCFF957SQS.fastq.gz
wget https://www.encodeproject.org/files/ENCFF582IOZ/@@download/ENCFF582IOZ.fastq.gz
wget https://www.encodeproject.org/files/ENCFF330MCZ/@@download/ENCFF330MCZ.fastq.gz
wget https://www.encodeproject.org/files/ENCFF293YFE/@@download/ENCFF293YFE.fastq.gz
wget https://www.encodeproject.org/files/ENCFF002DTU/@@download/ENCFF002DTU.fastq.gz
wget https://www.encodeproject.org/files/ENCFF002EFI/@@download/ENCFF002EFI.fastq.gz
wget https://www.encodeproject.org/files/ENCFF002EFG/@@download/ENCFF002EFG.fastq.gz
wget https://www.encodeproject.org/files/ENCFF002DTS/@@download/ENCFF002DTS.fastq.gz
echo "Done with Paired-end"
#!/usr/bin/env nextflow
params.design="$baseDir/../test_data/samplesheet.csv"
params.bams = "$baseDir/../test_data/*.bam"
params.peaks = "$baseDir/../test_data/*.broadPeak"
params.genomepath="/project/shared/bicf_workflow_ref/GRCh37"
toppeakcount = -1
design_file = file(params.design)
deeptools_design = Channel.fromPath(params.design)
diffbind_design = Channel.fromPath(params.design)
chipseeker_design = Channel.fromPath(params.design)
meme_design = Channel.fromPath(params.design)
index_bams = Channel.fromPath(params.bams)
deeptools_bams = Channel.fromPath(params.bams)
deeptools_peaks = Channel.fromPath(params.peaks)
chipseeker_peaks = Channel.fromPath(params.peaks)
diffbind_bams = Channel.fromPath(params.bams)
diffbind_peaks = Channel.fromPath(params.peaks)
meme_peaks = Channel.fromPath(params.peaks)
process bamindex {
publishDir "$baseDir/output/", mode: 'copy'
input:
file index_bam_files from index_bams
output:
file "*bai" into deeptools_bamindex
file "*bai" into diffbind_bamindex
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
module load samtools/intel/1.3
samtools index $index_bam_files
"""
}
process run_deeptools {
publishDir "$baseDir/output", mode: 'copy'
input:
file deeptools_design_file from deeptools_design
file deeptools_bam_files from deeptools_bams.toList()
file deeptools_peak_files from deeptools_peaks.toList()
file deeptools_bam_indexes from deeptools_bamindex.toList()
output:
file "*deeptools*" into deeptools_output
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
module load deeptools/2.3.5
python $baseDir/scripts/runDeepTools.py -i ${params.design} -g ${params.genomepath}}
"""
}
// Path to an input file, or a pattern for multiple inputs
// Note - $baseDir is the location of this workflow file main.nf
// Define Input variables
params.reads = "$baseDir/../test_data/*.fastq.gz"
params.pairedEnd = false
params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt"
process run_diffbind {
publishDir "$baseDir/output", mode: 'copy'
input:
file diffbind_design_file from diffbind_design
file diffbind_bam_files from diffbind_bams.toList()
file diffbind_peak_files from diffbind_peaks.toList()
file diffbind_bam_indexes from diffbind_bamindex.toList()
output:
file "diffpeak.design" into diffpeaksdesign_chipseeker
file "diffpeak.design" into diffpeaksdesign_meme
file "*_diffbind.bed" into diffpeaks_meme
file "*_diffbind.bed" into diffpeaks_chipseeker
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
Rscript $baseDir/scripts/runDiffBind.R $diffbind_design_file
"""
}
// Define List of Files
readsList = Channel
.fromPath( params.reads )
.flatten()
.map { file -> [ file.getFileName().toString(), file.toString() ].join("\t")}
.collectFile( name: 'fileList.tsv', newLine: true )
process run_chipseeker_diffpeak {
publishDir "$baseDir/output", mode: 'copy'
input:
file diffpeak_design_file from diffpeaksdesign_chipseeker
file diffpeaks from diffpeaks_chipseeker
output:
file "*chipseeker*" into chipseeker_diffpeak_output
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
Rscript $baseDir/scripts/runChipseeker.R $diffpeak_design_file ${params.genomepath}
"""
}
// Define regular variables
pairedEnd = params.pairedEnd
designFile = params.designFile
process run_chipseeker_originalpeak {
publishDir "$baseDir/output", mode: 'copy'
input:
file design_file from chipseeker_design
file chipseeker_peak_files from chipseeker_peaks.toList()
output:
file "*chipseeker*" into chipseeker_originalpeak_output
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
Rscript $baseDir/scripts/runChipseeker.R $design_file ${params.genomepath}
"""
}
process checkDesignFile {
publishDir "$baseDir/output/design", mode: 'copy'
input:
designFile
file readsList
output:
file("design.tsv") into designFilePaths
script:
if (pairedEnd) {
"""
python $baseDir/scripts/check_design.py -d $designFile -f $readsList -p
"""
}
else {
"""
python $baseDir/scripts/check_design.py -d $designFile -f $readsList
"""
}
process run_meme_original {
publishDir "$baseDir/output", mode: 'copy'
input:
file design_meme from meme_design
file meme_peak_files from meme_peaks.toList()
output:
file "*meme*" into meme_original_output
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
module add deeptools/2.3.5
module load meme/4.11.1-gcc-openmpi
python $baseDir/scripts/runMemechip.py -i $design_meme -g ${params.genomepath} -l ${toppeakcount}
"""
}
process run_meme_diffpeak {
publishDir "$baseDir/output", mode: 'copy'
input:
file peaks_meme from diffpeaks_meme
file diffpeak_design from diffpeaksdesign_meme
output:
file "*meme*" into meme_diffpeak_output
script:
"""
module load python/2.7.x-anaconda
module load R/3.3.2-gccmkl
module add deeptools/2.3.5
module load meme/4.11.1-gcc-openmpi
python $baseDir/scripts/runMemechip.py -i $diffpeak_design -g ${params.genomepath} -l ${toppeakcount}
"""
// Define channel for raw reads
if (pairedEnd) {
rawReads = designFilePaths
.splitCsv(sep: '\t', header: true)
.map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read2], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
} else {
rawReads = designFilePaths
.splitCsv(sep: '\t', header: true)
.map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read1], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
}
process fastQc {
tag "$sampleId-$replicate"
publishDir "$baseDir/output/", mode: 'copy',
saveAs: {filename -> filename.indexOf(".zip") > 0 ? "zips/$filename" : "$filename"}
input:
set sampleId, reads, biosample, factor, treatment, replicate, controlId from rawReads
output:
file '*_fastqc.{zip,html}' into fastqc_results
script:
"""
python $baseDir/scripts/qc_fastq.py -f $reads
"""
}
#!/usr/bin/env python3
'''Check if design file is correctly formatted and matches files list.'''
import argparse
import logging
import pandas as pd
EPILOG = '''
For more details:
%(prog)s --help
'''
## SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--design',
help="The design file to run QC (TSV format).",
required=True)
parser.add_argument('-f', '--fastq',
help="File with list of fastq files (csv format).",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
args = parser.parse_args()
return args
def check_design_headers(design, paired):
'''Check if design file conforms to sequencing type.'''
# Default headers
design_template = [
'sample_id',
'biosample',
'factor',
'treatment',
'replicate',
'control_id',
'fastq_read1']
design_headers = list(design.columns.values)
if paired: # paired-end data
design_template.extend(['fastq_read2'])
# Check if headers
logger.info("Running header check.")
missing_headers = set(design_template) - set(design_headers)
if len(missing_headers) > 0:
logger.error('Missing column headers: %s', list(missing_headers))
raise Exception("Missing column headers: %s" % list(missing_headers))
def check_controls(design):
'''Check if design file has the correct control mapping.'''
logger.info("Running control check.")
missing_controls = set(design['control_id']) - set(design['sample_id'])
if len(missing_controls) > 0:
logger.error('Missing control experiments: %s', list(missing_controls))
raise Exception("Missing control experiments: %s" % list(missing_controls))
def check_files(design, fastq, paired):
'''Check if design file has the files found.'''
logger.info("Running file check.")
if paired: # paired-end data
files = list(design['fastq_read1']) + list(design['fastq_read2'])
else: # single-end data
files = design['fastq_read1']
files_found = fastq['name']
missing_files = set(files) - set(files_found)
if len(missing_files) > 0:
logger.error('Missing files from design file: %s', list(missing_files))
raise Exception("Missing files from design file: %s" % list(missing_files))
else:
file_dict = fastq.set_index('name').T.to_dict()
design['fastq_read1'] = design['fastq_read1'] \
.apply(lambda x: file_dict[x]['path'])
if paired: # paired-end data
design['fastq_read2'] = design['fastq_read2'] \
.apply(lambda x: file_dict[x]['path'])
return design
def main():
args = get_args()
# Create a file handler
handler = logging.FileHandler('design.log')
logger.addHandler(handler)
# Read files
design_file = pd.read_csv(args.design, sep='\t')
fastq_file = pd.read_csv(args.fastq, sep='\t', names=['name', 'path'])
# Check design file
check_design_headers(design_file, args.paired)
check_controls(design_file)
new_design = check_files(design_file, fastq_file, args.paired)
# Write out new design file
new_design.to_csv('design.tsv', header=True, sep='\t', index=False)
if __name__ == '__main__':
main()
#!/usr/bin/env python3
'''QC check of raw .fastq files using FASTQC.'''
import os
import subprocess
import argparse
import shutil
import logging
import sys
import json
EPILOG = '''
For more details:
%(prog)s --help
'''
## SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-f', '--fastq',
help="The fastq file to run QC check on.",
nargs='+',
required=True)
args = parser.parse_args()
return args
def check_tools():
'''Checks for required componenets on user system'''
logger.info('Checking for required libraries and components on this system')
fastqc_path = shutil.which("fastqc")
if fastqc_path:
logger.info('Found fastqc: %s', fastqc_path)
else:
logger.error('Missing fastqc')
raise Exception('Missing fastqc')
def check_qual_fastq(fastq):
'''Run fastqc on 1 or 2 files.'''
qc_command = "fastqc -t -f fastq " + " ".join(fastq)
logger.info("Running fastqc with %s", qc_command)
qual_fastq = subprocess.Popen(qc_command, shell=True)
out, err = qual_fastq.communicate()
def main():
args = get_args()
# Create a file handler
handler = logging.FileHandler('qc.log')
LOGGER.addHandler(handler)
# Check if tools are present
check_tools()
# Run quality checks
check_qual_fastq(args.fastq)
if __name__ == '__main__':
main()
File deleted
File deleted
#!/usr/bin/env python3
import os
import pytest
import pandas as pd
from io import StringIO
import check_design
import sys
DESIGN_STRING = """sample_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\tfastq_read1
A_1\tLiver\tH3K27ac\tNone\t1\tB_1\tA_1.fastq.gz
A_2\tLiver\tH3K27ac\tNone\t2\tB_2\tA_2.fastq.gz
B_1\tLiver\tInput\tNone\t1\tB_1\tB_1.fastq.gz
B_2\tLiver\tInput\tNone\t2\tB_2\tB_2.fastq.gz
"""
FASTQ_STRING = """
A_1.fastq.gz\t/path/to/file/A_1.fastq.gz
A_2.fastq.gz\t/path/to/file/A_2.fastq.gz
B_1.fastq.gz\t/path/to/file/B_1.fastq.gz
B_2.fastq.gz\t/path/to/file/B_2.fastq.gz
"""
@pytest.fixture
def design():
design_file = StringIO(DESIGN_STRING)
design_df = pd.read_csv(design_file, sep="\t")
return design_df
@pytest.fixture
def fastq_files():
fastq_file = StringIO(FASTQ_STRING)
fastq_df = pd.read_csv(fastq_file, sep='\t', names=['name', 'path'])
return fastq_df
@pytest.fixture
def design_1(design):
design_df = design.drop('fastq_read1', axis=1)
return design_df
@pytest.fixture
def design_2(design):
# Drop Control B_1
design_df = design.drop(design.index[2])
return design_df
@pytest.fixture
def design_3(design):
# Drop A_2 and B_2 and append as fastq_read2
design_df = design.drop(design.index[[1,3]])
design_df['fastq_read2'] = design.loc[[1,3],'fastq_read1'].values
return design_df
@pytest.fixture
def fastq_files_1(fastq_files):
# Drop B_2.fastq.gz
fastq_df = fastq_files.drop(fastq_files.index[3])
return fastq_df
def test_check_headers_singleend(design_1):
paired = False
with pytest.raises(Exception) as excinfo:
check_design.check_design_headers(design_1, paired)
assert str(excinfo.value) == "Missing column headers: ['fastq_read1']"
def test_check_headers_pairedend(design):
paired = True
with pytest.raises(Exception) as excinfo:
check_design.check_design_headers(design, paired)
assert str(excinfo.value) == "Missing column headers: ['fastq_read2']"
def test_check_controls(design_2):
with pytest.raises(Exception) as excinfo:
check_design.check_controls(design_2)
assert str(excinfo.value) == "Missing control experiments: ['B_1']"
def test_check_files_missing_files(design, fastq_files_1):
paired = False
with pytest.raises(Exception) as excinfo:
new_design = check_design.check_files(design, fastq_files_1, paired)
assert str(excinfo.value) == "Missing files from design file: ['B_2.fastq.gz']"
def test_check_files_output_singleend(design, fastq_files):
paired = False
new_design = check_design.check_files(design, fastq_files, paired)
assert new_design.loc[0,'fastq_read1'] == "/path/to/file/A_1.fastq.gz"
def test_check_files_output_pairedend(design_3, fastq_files):
paired = True
new_design = check_design.check_files(design_3, fastq_files, paired)
assert new_design.loc[0,'fastq_read2'] == "/path/to/file/A_2.fastq.gz"
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment