diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 68ee9ceb5ead4311861ab92c5224a2b7f7fdb651..acb133b5be9f6f5e79c352f6704633fbe716d907 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,7 +1,10 @@ before_script: - module add python/3.6.1-2-anaconda - pip install --user pytest-pythonpath + - module load nextflow/0.24.1-SNAPSHOT + - ln -s /project/shared/bicf_workflow_ref/workflow_testdata/chipseq/*fastq.gz test_data/ test: script: + - nextflow run workflow/main.nf - pytest diff --git a/astrocyte_pkg.yml b/astrocyte_pkg.yml index b812c4a85ff956e2f16af1d5b4ac1637cefbe1e3..f8b8dcf0bd0e0684ac9042aeeff55e5f2c7e628a 100644 --- a/astrocyte_pkg.yml +++ b/astrocyte_pkg.yml @@ -9,7 +9,7 @@ # A unique identifier for the workflow package, text/underscores only name: 'chipseq_analysis_bicf' # Who wrote this? -author: 'Beibei Chen' +author: 'Beibei Chen and Venkat Malladi' # A contact email address for questions email: 'biohpc-help@utsouthwestern.edu' # A more informative title for the workflow package @@ -17,8 +17,7 @@ title: 'BICF ChIP-seq Analysis Workflow' # A summary of the workflow package in plain text description: | This is a workflow package for the BioHPC/BICF ChIP-seq workflow system. - It implements a simple ChIP-seq analysis workflow and - visualization application. + It implements ChIP-seq analysis workflow and visualization application. # ----------------------------------------------------------------------------- # DOCUMENTATION @@ -40,9 +39,11 @@ documentation_files: # A list of cluster environment modules that this workflow requires to run. # Specify versioned module names to ensure reproducability. workflow_modules: - - 'fastqc/0.11.5' - - 'deeptools/2.3.5' - - 'meme/4.11.1-gcc-openmpi' + - 'python/3.6.1-2-anaconda' + - 'trimgalore/0.4.1' + - 'cutadapt/1.9.1' + - 'fastqc/0.11.2' + - 'bwa/intel/0.7.12' # A list of parameters used by the workflow, defining how to present them, # options etc in the web interface. For each parameter: @@ -82,10 +83,11 @@ workflow_parameters: type: files required: true description: | - Fastq files of all samples - regex: "*_{1,2}.fastq.gz" + One or more input FASTQ files from a ChIP-seq expereiment and a design + file with the link bewetwen the same file name and sample id + regex: ".*(fastq|fq)*" - - id: singleEnd + - id: pairedEnd type: select required: true choices: @@ -98,6 +100,24 @@ workflow_parameters: read length, and then starts another round of reading from the opposite end of the fragment. + - id: design + type: file + required: true + description: | + A design file listing sample id, fastq files, corresponding control id + and additional information about the sample. + + - id: genome + type: select + choices: + - [ 'GRCh38', 'Human GRCh38'] + - [ 'GRCh37', 'Human GRCh37'] + - [ 'GRCh38', 'Mouse GRCh38'] + required: true + description: | + Reference species and genome used for alignment and subsequent analysis. + + # ----------------------------------------------------------------------------- # SHINY APP CONFIGURATION # ----------------------------------------------------------------------------- @@ -114,7 +134,7 @@ vizapp_cran_packages: - shiny - shinyFiles -# List of any Bioconductor packages, not provided by the modules, +# List of any Bioconductor packages, not provided by the modules, # that must be made available to the vizapp vizapp_bioc_packages: - qusage diff --git a/nextflow.config b/nextflow.config deleted file mode 100644 index 7fc857a03a9f33a5aad2a64d6b608b555422204a..0000000000000000000000000000000000000000 --- a/nextflow.config +++ /dev/null @@ -1,5 +0,0 @@ -process.executor='slurm' -process.queue='super' -process.time='5d' - - diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config new file mode 100644 index 0000000000000000000000000000000000000000..c92f9f5ecb8b174f5a6db24c177638466451c90d --- /dev/null +++ b/workflow/conf/biohpc.config @@ -0,0 +1,33 @@ +process { + executor = 'slurm' + queue='super' + + // Process specific configuration + $checkDesignFile { + module = ['python/3.6.1-2-anaconda'] + executor = 'local' + } + $trim_galore { + module = ['python/3.6.1-2-anaconda', 'trimgalore/0.4.1'] + cpus = 32 + } + $alignReads{ + module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/intel/1.3'] + cpus = 32 + } +} + +params { + // Reference file paths on BioHPC + genomes { + 'GRCh38' { bwa = '/project/shared/bicf_workflow_ref/GRCh38' } + 'GRCh37' { bwa = '/project/shared/bicf_workflow_ref/GRCh37' } + 'GRCm38' { bwa = '/project/shared/bicf_workflow_ref/GRCm38' } + } +} + +trace { + enabled = true + file = 'pipeline_trace.txt' + fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss' +} diff --git a/workflow/main.nf b/workflow/main.nf index ee90c3a58b01b2016861705bf6b297af49f85b5b..90763c5aa18db0c0417bcc83c6002c11b5b15946 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -7,6 +7,18 @@ params.reads = "$baseDir/../test_data/*.fastq.gz" params.pairedEnd = false params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt" +params.genome = 'GRCm38' +params.genomes = [] +params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false + +// Check inputs +if( params.bwaIndex ){ + bwaIndex = Channel + .fromPath(params.bwaIndex) + .ifEmpty { exit 1, "BWA index not found: ${params.bwaIndex}" } +} else { + exit 1, "No reference genome specified." +} // Define List of Files readsList = Channel @@ -36,7 +48,7 @@ process checkDesignFile { if (pairedEnd) { """ - python $baseDir/scripts/check_design.py -d $designFile -f $readsList -p + python3 $baseDir/scripts/check_design.py -d $designFile -f $readsList -p """ } else { @@ -55,14 +67,14 @@ if (pairedEnd) { } else { rawReads = designFilePaths .splitCsv(sep: '\t', header: true) - .map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read1], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] } + .map { row -> [ row.sample_id, [row.fastq_read1], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] } } -process fastQc { +// Trim raw reads using trimgalore +process trimReads { tag "$sampleId-$replicate" - publishDir "$baseDir/output/", mode: 'copy', - saveAs: {filename -> filename.indexOf(".zip") > 0 ? "zips/$filename" : "$filename"} + publishDir "$baseDir/output/${task.process}", mode: 'copy' input: @@ -70,11 +82,51 @@ process fastQc { output: - file '*_fastqc.{zip,html}' into fastqc_results + set sampleId, file('*.fq.gz'), biosample, factor, treatment, replicate, controlId into trimmedReads + file('*trimming_report.txt') into trimgalore_results script: - """ - python $baseDir/scripts/qc_fastq.py -f $reads - """ + if (pairedEnd) { + """ + python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -p + """ + } + else { + """ + python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} + """ + } + +} + +// Align trimmed reads using bwa +process alignReads { + + tag "$sampleId-$replicate" + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + set sampleId, reads, biosample, factor, treatment, replicate, controlId from trimmedReads + file index from bwaIndex.first() + + output: + + set sampleId, file('*.bam'), biosample, factor, treatment, replicate, controlId into mappedReads + set file('*.srt.bam.flagstat.qc') + + script: + + if (pairedEnd) { + """ + python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa -p + """ + } + else { + """ + python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa + """ + } + } diff --git a/workflow/nextflow.config b/workflow/nextflow.config new file mode 100644 index 0000000000000000000000000000000000000000..30e47ea1aea37ed6550cc2944d69d26e69887489 --- /dev/null +++ b/workflow/nextflow.config @@ -0,0 +1,5 @@ +profiles { + standard { + includeConfig 'conf/biohpc.config' + } +} diff --git a/workflow/scripts/check_design.py b/workflow/scripts/check_design.py index 929a97dc3b12da8ddd254671bdb62ff4a194c063..acf4f014f2c4ed24d25da9bd5e268ace9bd2fadd 100644 --- a/workflow/scripts/check_design.py +++ b/workflow/scripts/check_design.py @@ -11,7 +11,7 @@ For more details: %(prog)s --help ''' -## SETTINGS +# SETTINGS logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) @@ -57,7 +57,7 @@ def check_design_headers(design, paired): design_headers = list(design.columns.values) - if paired: # paired-end data + if paired: # paired-end data design_template.extend(['fastq_read2']) # Check if headers @@ -79,7 +79,8 @@ def check_controls(design): if len(missing_controls) > 0: logger.error('Missing control experiments: %s', list(missing_controls)) - raise Exception("Missing control experiments: %s" % list(missing_controls)) + raise Exception("Missing control experiments: %s" % + list(missing_controls)) def check_files(design, fastq, paired): @@ -87,9 +88,9 @@ def check_files(design, fastq, paired): logger.info("Running file check.") - if paired: # paired-end data + if paired: # paired-end data files = list(design['fastq_read1']) + list(design['fastq_read2']) - else: # single-end data + else: # single-end data files = design['fastq_read1'] files_found = fastq['name'] @@ -98,13 +99,14 @@ def check_files(design, fastq, paired): if len(missing_files) > 0: logger.error('Missing files from design file: %s', list(missing_files)) - raise Exception("Missing files from design file: %s" % list(missing_files)) + raise Exception("Missing files from design file: %s" % + list(missing_files)) else: file_dict = fastq.set_index('name').T.to_dict() design['fastq_read1'] = design['fastq_read1'] \ .apply(lambda x: file_dict[x]['path']) - if paired: # paired-end data + if paired: # paired-end data design['fastq_read2'] = design['fastq_read2'] \ .apply(lambda x: file_dict[x]['path']) return design diff --git a/workflow/scripts/map_reads.py b/workflow/scripts/map_reads.py new file mode 100644 index 0000000000000000000000000000000000000000..ba6bbd0722a49ab09ddd84befbfe0a0ce3e07f19 --- /dev/null +++ b/workflow/scripts/map_reads.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 + +'''Align reads to reference genome.''' + +import os +import subprocess +import argparse +import shutil +import shlex +import logging +from multiprocessing import cpu_count +import json +import utils + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + +# the order of this list is important. +# strip_extensions strips from the right inward, so +# the expected right-most extensions should appear first (like .gz) +# Modified from J. Seth Strattan +STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '_trimmed'] + + +def get_args(): + '''Define arguments.''' + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-f', '--fastq', + help="The fastq file to run triming on.", + nargs='+', + required=True) + + parser.add_argument('-r', '--reference', + help="The bwa index of the reference genome.", + required=True) + + parser.add_argument('-p', '--paired', + help="True/False if paired-end or single end.", + default=False, + action='store_true') + + args = parser.parse_args() + return args + + +# Functions + + +def check_tools(): + '''Checks for required componenets on user system''' + + logger.info('Checking for required libraries and components on this system') + + bwa_path = shutil.which("bwa") + if bwa_path: + logger.info('Found bwa: %s', bwa_path) + else: + logger.error('Missing bwa') + raise Exception('Missing bwa') + + samtools_path = shutil.which("samtools") + if samtools_path: + logger.info('Found samtools: %s', samtools_path) + else: + logger.error('Missing samtools') + raise Exception('Missing samtools') + + +def generate_sa(fastq, reference): + '''Use BWA to generate Suffix Arrays.''' + + fastq_basename = os.path.basename(utils.strip_extensions(fastq, STRIP_EXTENSIONS)) + + bwa_aln_params = '-q 5 -l 32 -k 2' + + sai = '%s.sai' % (fastq_basename) + with open(sai, 'w') as sai_file: + bwa_command = "bwa aln %s -t %d %s %s" \ + % (bwa_aln_params, cpu_count(), + reference, fastq) + + logger.info("Running bwa with %s", bwa_command) + subprocess.check_call(shlex.split(bwa_command), stdout=sai_file) + + return sai + + +def align_se(fastq, sai, reference, fastq_basename): + '''Use BWA to align SE data.''' + + bam_filename = '%s.srt.bam' % (fastq_basename) + + steps = [ + "bwa samse %s %s %s" + % (reference, sai[0], fastq[0]), + "samtools view -@%d -Su -" % (cpu_count()), + "samtools sort -@%d -o %s" + % (cpu_count(), bam_filename)] + + out, err = utils.run_pipe(steps) + if err: + logger.error("samse/samtools error: %s" % (err)) + + return bam_filename + + +def align_pe(fastq, sai, reference, fastq_basename): + '''Use BWA to align PE data.''' + + sam_filename = "%s.sam" % (fastq_basename) + badcigar_filename = "%s.badReads" % (fastq_basename) + bam_filename = '%s.srt.bam' % (fastq_basename) + + # Remove read pairs with bad CIGAR strings and sort by position + steps = [ + "bwa sampe -P %s %s %s %s %s" + % (reference, sai[0], sai[1], + fastq[0], fastq[1]), + "tee %s" % (sam_filename), + r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""", + "sort", + "uniq"] + + out, err = utils.run_pipe(steps, badcigar_filename) + if err: + logger.error("sampe error: %s" % (err)) + + steps = [ + "cat %s" % (sam_filename), + "grep -v -F -f %s" % (badcigar_filename), + "samtools view -@%d -Su -" % (cpu_count()), + "samtools sort -@%d -o %s" + % (cpu_count(), bam_filename)] + + out, err = utils.run_pipe(steps) + if err: + logger.error("samtools error: %s" % (err)) + + return bam_filename + + +def main(): + args = get_args() + paired = args.paired + fastq = args.fastq + reference = args.reference + + # Create a file handler + handler = logging.FileHandler('map.log') + logger.addHandler(handler) + + # Check if tools are present + check_tools() + + # Run Suffix Array generation + sai = [] + for fq in fastq: + sai_filename = generate_sa(fq, reference) + sai.append(sai_filename) + + # Run alignment for either PE or SE + if paired: # paired-end data + fastq_r1_basename = os.path.basename( + utils.strip_extensions(fastq[0], STRIP_EXTENSIONS)) + fastq_r2_basename = os.path.basename( + utils.strip_extensions(fastq[1], STRIP_EXTENSIONS)) + fastq_basename = fastq_r1_basename + fastq_r2_basename + + bam_filename = align_pe(fastq, sai, reference, fastq_basename) + + else: + fastq_basename = os.path.basename( + utils.strip_extensions(fastq[0], STRIP_EXTENSIONS)) + + bam_filename = align_se(fastq, sai, reference, fastq_basename) + + bam_mapstats_filename = '%s.srt.bam.flagstat.qc' % (fastq_basename) + with open(bam_mapstats_filename, 'w') as fh: + subprocess.check_call( + shlex.split("samtools flagstat %s" % (bam_filename)), + stdout=fh) + + # Remove sai files + for sai_file in sai: + os.remove(sai_file) + + +if __name__ == '__main__': + main() diff --git a/workflow/scripts/qc_fastq.py b/workflow/scripts/qc_fastq.py index 95d817270d95a8bec029551ec4fa15e1f5031ff6..1d4f9270a9282cf20f07331b1f6182863449f574 100755 --- a/workflow/scripts/qc_fastq.py +++ b/workflow/scripts/qc_fastq.py @@ -66,7 +66,7 @@ def main(): # Create a file handler handler = logging.FileHandler('qc.log') - LOGGER.addHandler(handler) + logger.addHandler(handler) # Check if tools are present check_tools() diff --git a/workflow/scripts/trim_reads.py b/workflow/scripts/trim_reads.py new file mode 100644 index 0000000000000000000000000000000000000000..f9f2524f942fb356c360696cee70ea80a403cc4b --- /dev/null +++ b/workflow/scripts/trim_reads.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 + +'''Trim low quality reads and remove sequences less than 35 base pairs.''' + +import subprocess +import argparse +import shutil +import logging + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +def get_args(): + '''Define arguments.''' + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-f', '--fastq', + help="The fastq file to run triming on.", + nargs='+', + required=True) + + parser.add_argument('-p', '--paired', + help="True/False if paired-end or single end.", + default=False, + action='store_true') + + args = parser.parse_args() + return args + + +def check_tools(): + '''Checks for required componenets on user system.''' + + logger.info('Checking for required libraries and components on this system') + + trimgalore_path = shutil.which("trim_galore") + if trimgalore_path: + logger.info('Found trimgalore: %s', trimgalore_path) + else: + logger.error('Missing trimgalore') + raise Exception('Missing trimgalore') + + cutadapt_path = shutil.which("cutadapt") + if cutadapt_path: + logger.info('Found cutadapt: %s', cutadapt_path) + else: + logger.error('Missing cutadapt') + raise Exception('Missing cutadapt') + + +def trim_reads(fastq, paired): + '''Run trim_galore on 1 or 2 files.''' + + if paired: # paired-end data + trim_params = '--paired -q 25 --illumina --gzip --length 35' + trim_command = "trim_galore %s %s %s " \ + % (trim_params, fastq[0], fastq[1]) + else: + trim_params = '-q 25 --illumina --gzip --length 35' + trim_command = "trim_galore %s %s " \ + % (trim_params, fastq[0]) + + logger.info("Running trim_galore with %s", trim_command) + + trim = subprocess.Popen(trim_command, shell=True) + out, err = trim.communicate() + + +def main(): + args = get_args() + + # Create a file handler + handler = logging.FileHandler('trim.log') + logger.addHandler(handler) + + # Check if tools are present + check_tools() + + # Run trim_reads + trim_reads(args.fastq, args.paired) + + +if __name__ == '__main__': + main() diff --git a/workflow/scripts/utils.py b/workflow/scripts/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5162646d56b5281a88cd7195dba784865cffe7f8 --- /dev/null +++ b/workflow/scripts/utils.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 + +'''General utilities.''' + + +import shlex +import logging + + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = True + + +def run_pipe(steps, outfile=None): + # TODO: capture stderr + from subprocess import Popen, PIPE + p = None + p_next = None + first_step_n = 1 + last_step_n = len(steps) + for n, step in enumerate(steps, start=first_step_n): + logger.debug("step %d: %s" % (n, step)) + if n == first_step_n: + if n == last_step_n and outfile: # one-step pipeline with outfile + with open(outfile, 'w') as fh: + print("one step shlex: %s to file: %s" % (shlex.split(step), outfile)) + p = Popen(shlex.split(step), stdout=fh) + break + print("first step shlex to stdout: %s" % (shlex.split(step))) + + p = Popen(shlex.split(step), stdout=PIPE) + elif n == last_step_n and outfile: # only treat the last step specially if you're sending stdout to a file + with open(outfile, 'w') as fh: + print("last step shlex: %s to file: %s" % (shlex.split(step), outfile)) + p_last = Popen(shlex.split(step), stdin=p.stdout, stdout=fh) + p.stdout.close() + p = p_last + else: # handles intermediate steps and, in the case of a pipe to stdout, the last step + print("intermediate step %d shlex to stdout: %s" % (n, shlex.split(step))) + p_next = Popen(shlex.split(step), stdin=p.stdout, stdout=PIPE) + p.stdout.close() + p = p_next + out, err = p.communicate() + return out, err + + +def strip_extensions(filename, extensions): + '''Strips extensions to get basename of file.''' + + basename = filename + for extension in extensions: + basename = basename.rpartition(extension)[0] or basename + + return basename diff --git a/workflow/tests/test_check_design.py b/workflow/tests/test_check_design.py index 394d5251b382b682bf758debdedd12aaf5637551..a63bc7772068ec265fcd1bc656e3d36e92ffa853 100644 --- a/workflow/tests/test_check_design.py +++ b/workflow/tests/test_check_design.py @@ -1,11 +1,9 @@ #!/usr/bin/env python3 -import os import pytest import pandas as pd from io import StringIO import check_design -import sys DESIGN_STRING = """sample_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\tfastq_read1 @@ -49,11 +47,12 @@ def design_2(design): design_df = design.drop(design.index[2]) return design_df + @pytest.fixture def design_3(design): # Drop A_2 and B_2 and append as fastq_read2 - design_df = design.drop(design.index[[1,3]]) - design_df['fastq_read2'] = design.loc[[1,3],'fastq_read1'].values + design_df = design.drop(design.index[[1, 3]]) + design_df['fastq_read2'] = design.loc[[1, 3], 'fastq_read1'].values return design_df @@ -94,10 +93,10 @@ def test_check_files_missing_files(design, fastq_files_1): def test_check_files_output_singleend(design, fastq_files): paired = False new_design = check_design.check_files(design, fastq_files, paired) - assert new_design.loc[0,'fastq_read1'] == "/path/to/file/A_1.fastq.gz" + assert new_design.loc[0, 'fastq_read1'] == "/path/to/file/A_1.fastq.gz" def test_check_files_output_pairedend(design_3, fastq_files): paired = True new_design = check_design.check_files(design_3, fastq_files, paired) - assert new_design.loc[0,'fastq_read2'] == "/path/to/file/A_2.fastq.gz" + assert new_design.loc[0, 'fastq_read2'] == "/path/to/file/A_2.fastq.gz" diff --git a/workflow/tests/test_map_reads.py b/workflow/tests/test_map_reads.py new file mode 100644 index 0000000000000000000000000000000000000000..d5a83667a39e768c0c66698b298a53f1aa681ce2 --- /dev/null +++ b/workflow/tests/test_map_reads.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +import pytest +import os + + +def test_map_reads_singleend(): + aligned_reads_report = os.path.dirname(os.path.abspath(__file__)) + \ + '/../output/alignReads/ENCFF646LXU.srt.bam.flagstat.qc' + samtools_report = open(aligned_reads_report).readlines() + assert '80795025 + 0 in total' in samtools_report[0] + assert '80050072 + 0 mapped (99.08% : N/A)' in samtools_report[4] + + +def test_map_reads_pairedend(): + # Do the same thing for paired end data + pass diff --git a/workflow/tests/test_trim_reads.py b/workflow/tests/test_trim_reads.py new file mode 100644 index 0000000000000000000000000000000000000000..e9680f681251d9d9a9db07ce679e5e5d6a0eba2f --- /dev/null +++ b/workflow/tests/test_trim_reads.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 + +import pytest +import os + + +def test_trim_reads_singleend(): + raw_fastq = os.path.dirname(os.path.abspath(__file__)) + \ + '/../../test_data/ENCFF833BLU.fastq.gz' + trimmed_fastq = os.path.dirname(os.path.abspath(__file__)) + \ + '/../output/trimReads/ENCFF833BLU_trimmed.fq.gz' + trimmed_fastq_report = os.path.dirname(os.path.abspath(__file__)) + \ + '/../output/trimReads/ENCFF833BLU.fastq.gz_trimming_report.txt' + assert os.path.getsize(raw_fastq) != os.path.getsize(trimmed_fastq) + assert os.path.getsize(trimmed_fastq) == 2512853101 + assert 'Trimming mode: single-end' in open(trimmed_fastq_report).readlines()[4] + + +def test_trim_reads_pairedend(): + # Do the same thing for paired end data + pass diff --git a/workflow/tests/test_utils.py b/workflow/tests/test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3bbe48abcabc3a027e978dbea73b2a9d350e794d --- /dev/null +++ b/workflow/tests/test_utils.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 + +import pytest +import utils + + +STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '.fa', '.fasta'] + + +@pytest.fixture +def steps(): + steps = [] + return steps + + +@pytest.fixture +def steps_1(steps): + design_file = "test_data/design_ENCSR238SGC_SE.txt" + step = [ + "grep H3K4me1 %s " % (design_file)] + return step + + +@pytest.fixture +def steps_2(steps_1): + steps_1.extend([ + "cut -f7" + ]) + return steps_1 + + +def test_run_one_step(steps_1, capsys): + check_output = 'ENCSR238SGC\tlimb\tH3K4me1\tNone\t1\tENCSR687ALB\tENCFF833BLU.fastq.gz'.encode('UTF-8') + out, err = utils.run_pipe(steps_1) + output, errors = capsys.readouterr() + assert "first step shlex to stdout" in output + assert check_output in out + + +def test_run_two_step(steps_2, capsys): + check_output = 'ENCFF833BLU.fastq.gz\nENCFF646LXU.fastq.gz'.encode('UTF-8') + out, err = utils.run_pipe(steps_2) + output, errors = capsys.readouterr() + assert "intermediate step 2 shlex to stdout" in output + assert check_output in out + + +def test_run_last_step_file(steps_2, capsys, tmpdir): + check_output = 'ENCFF833BLU.fastq.gz\nENCFF646LXU.fastq.gz' + tmp_outfile = tmpdir.join('output.txt') + out, err = utils.run_pipe(steps_2, tmp_outfile.strpath) + output, errors = capsys.readouterr() + assert "last step shlex" in output + assert check_output in tmp_outfile.read() + + +def test_strip_extensions(): + filename = utils.strip_extensions('ENCFF833BLU.fastq.gz', STRIP_EXTENSIONS) + assert filename == 'ENCFF833BLU' + + +def test_strip_extensions_not_valid(): + filename = utils.strip_extensions('ENCFF833BLU.not.valid', STRIP_EXTENSIONS) + assert filename == 'ENCFF833BLU.not.valid' + + +def test_strip_extensions_missing_basename(): + filename = utils.strip_extensions('.fastq.gz', STRIP_EXTENSIONS) + assert filename == '.fastq'