diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..113294a5f18d979085b4ad570d8a22a8e4eabd58 --- /dev/null +++ b/.gitignore @@ -0,0 +1,99 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2cbec7d23f3b67cac83af9ee0ccf901d29990172..68ee9ceb5ead4311861ab92c5224a2b7f7fdb651 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,3 +1,7 @@ +before_script: + - module add python/3.6.1-2-anaconda + - pip install --user pytest-pythonpath + test: script: - - module add python/3.6.1-2-anaconda + - pytest diff --git a/astrocyte_pkg.yml b/astrocyte_pkg.yml index 5c2f7f547ac2ce94b882b439f88e6668ba5aa0ac..b812c4a85ff956e2f16af1d5b4ac1637cefbe1e3 100644 --- a/astrocyte_pkg.yml +++ b/astrocyte_pkg.yml @@ 
-17,7 +17,8 @@ title: 'BICF ChIP-seq Analysis Workflow' # A summary of the workflow package in plain text description: | This is a workflow package for the BioHPC/BICF ChIP-seq workflow system. - It implements a simple ChIP-seq analysis workflow using deepTools, Diffbind, ChipSeeker and MEME-ChIP, visualization application. + It implements a simple ChIP-seq analysis workflow and + visualization application. # ----------------------------------------------------------------------------- # DOCUMENTATION @@ -33,12 +34,13 @@ documentation_files: # NEXTFLOW WORKFLOW CONFIGURATION # ----------------------------------------------------------------------------- -# Remember - The workflow file is always named 'workflow/main.f' +# Remember - The workflow file is always named 'workflow/main.nf' # The workflow must publish all final output into $baseDir -# A list of clueter environment modules that this workflow requires to run. +# A list of cluster environment modules that this workflow requires to run. # Specify versioned module names to ensure reproducability. workflow_modules: + - 'fastqc/0.11.5' - 'deeptools/2.3.5' - 'meme/4.11.1-gcc-openmpi' @@ -76,45 +78,25 @@ workflow_modules: workflow_parameters: - - id: bams + - id: reads type: files required: true description: | - Bam files of all samples - regex: ".*(bam|BAM)" + Fastq files of all samples + regex: ".*[.]fastq[.]gz" - - id: peaks - type: files - required: true - description: | - Peak files of all samples. Peaks should be sorted by user using either p_value or intensity of the signals.Bed format. - regex: ".*(narrowPeak|broadPeak|bed|BED)" - - - - id: design - type: files - required: true - regex: ".*(csv)" - description: | - A design file listing pairs of sample name and sample group. 
Must be in csv format - Columns must include: SampleID,Tissue, Factor, Condition, Replicate, Peaks, bamReads, bamControl, ControlID, PeakCaller - - - id: genomepath + - id: singleEnd type: select - choices: - - [ '/project/shared/bicf_workflow_ref/GRCh38', 'human GRCh38'] - - [ '/project/shared/bicf_workflow_ref/GRCh37', 'human GRCh37'] - - [ '/project/shared/bicf_workflow_ref/GRCm38', 'mouse GRCm38'] - required: true - description: | - Reference genome for annotation - - - id: toppeakcount - type: integer required: true - default: -1 + choices: + - [ 'true', 'True'] + - [ 'false', 'False'] description: | - The number of top peaks to use for motif discovery. This program will nott sort peak BED files for you, so please make sure your peak files are already sorted.If want all peaks to be used, use -1. + In single-end sequencing, the sequencer reads a fragment from only one + end to the other, generating the sequence of base pairs. In paired-end + reading it starts at one read, finishes this direction at the specified + read length, and then starts another round of reading from the opposite + end of the fragment. 
# ----------------------------------------------------------------------------- # SHINY APP CONFIGURATION @@ -132,11 +114,10 @@ vizapp_cran_packages: - shiny - shinyFiles -# # List of any Bioconductor packages, not provided by the modules, that must be made -# available to the vizapp +# List of any Bioconductor packages, not provided by the modules, +# that must be made available to the vizapp vizapp_bioc_packages: - qusage # - ballgown vizapp_github_packages: - js229/Vennerable - diff --git a/index.md b/index.md deleted file mode 100644 index a8b43e02a51783d3e21c842b29655e2d866bf1ab..0000000000000000000000000000000000000000 --- a/index.md +++ /dev/null @@ -1 +0,0 @@ -this is a read me diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000000000000000000000000000000000000..c8d98f29942ac24e83b741a0e102714b44230eab --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +python_paths = workflow/scripts diff --git a/test_data/design_ENCSR238SGC_SE.txt b/test_data/design_ENCSR238SGC_SE.txt new file mode 100644 index 0000000000000000000000000000000000000000..80d4b42ac047e37d15036f9638f51e5ef33c6c2e --- /dev/null +++ b/test_data/design_ENCSR238SGC_SE.txt @@ -0,0 +1,5 @@ +sample_id biosample factor treatment replicate control_id fastq_read1 +ENCSR238SGC limb H3K4me1 None 1 ENCSR687ALB ENCFF833BLU.fastq.gz +ENCSR238SGC limb H3K4me1 None 2 ENCSR687ALB ENCFF646LXU.fastq.gz +ENCSR687ALB limb Control None 1 ENCSR687ALB ENCFF524CAC.fastq.gz +ENCSR687ALB limb Control None 2 ENCSR687ALB ENCFF163AJI.fastq.gz diff --git a/test_data/design_ENCSR729LGA_PE.txt b/test_data/design_ENCSR729LGA_PE.txt new file mode 100644 index 0000000000000000000000000000000000000000..0bd9fa22263b465223faa1a8bd5e4f91cbee2334 --- /dev/null +++ b/test_data/design_ENCSR729LGA_PE.txt @@ -0,0 +1,5 @@ +sample_id biosample factor treatment replicate control_id fastq_read1 fastq_read2 +ENCSR729LGA MCF-7 SP1 None 1 ENCSR217LRF ENCFF957SQS.fastq.gz ENCFF582IOZ.fastq.gz +ENCSR729LGA MCF-7 SP1 None 
2 ENCSR217LRF ENCFF330MCZ.fastq.gz ENCFF293YFE.fastq.gz +ENCSR217LRF MCF-7 Control None 1 ENCSR217LRF ENCFF002DTU.fastq.gz ENCFF002EFI.fastq.gz +ENCSR217LRF MCF-7 Control None 2 ENCSR217LRF ENCFF002EFG.fastq.gz ENCFF002DTS.fastq.gz diff --git a/test_data/fetch_test_data.sh b/test_data/fetch_test_data.sh new file mode 100644 index 0000000000000000000000000000000000000000..a6107cdd49252429c01b54313fb64e3479ee5813 --- /dev/null +++ b/test_data/fetch_test_data.sh @@ -0,0 +1,17 @@ +echo "Downloading Single-end data set Mouse ENCSR238SGC and ENCSR687ALB" +wget https://www.encodeproject.org/files/ENCFF833BLU/@@download/ENCFF833BLU.fastq.gz +wget https://www.encodeproject.org/files/ENCFF646LXU/@@download/ENCFF646LXU.fastq.gz +wget https://www.encodeproject.org/files/ENCFF524CAC/@@download/ENCFF524CAC.fastq.gz +wget https://www.encodeproject.org/files/ENCFF163AJI/@@download/ENCFF163AJI.fastq.gz +echo "Done with Single-end" + +echo "Downloading Paired-end data set Human ENCSR729LGA and ENCSR217LRF" +wget https://www.encodeproject.org/files/ENCFF957SQS/@@download/ENCFF957SQS.fastq.gz +wget https://www.encodeproject.org/files/ENCFF582IOZ/@@download/ENCFF582IOZ.fastq.gz +wget https://www.encodeproject.org/files/ENCFF330MCZ/@@download/ENCFF330MCZ.fastq.gz +wget https://www.encodeproject.org/files/ENCFF293YFE/@@download/ENCFF293YFE.fastq.gz +wget https://www.encodeproject.org/files/ENCFF002DTU/@@download/ENCFF002DTU.fastq.gz +wget https://www.encodeproject.org/files/ENCFF002EFI/@@download/ENCFF002EFI.fastq.gz +wget https://www.encodeproject.org/files/ENCFF002EFG/@@download/ENCFF002EFG.fastq.gz +wget https://www.encodeproject.org/files/ENCFF002DTS/@@download/ENCFF002DTS.fastq.gz +echo "Done with Paired-end" diff --git a/workflow/main.nf b/workflow/main.nf index bb90254ace37a6ba2ac9dc34be79cba7de371d94..ee90c3a58b01b2016861705bf6b297af49f85b5b 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -1,139 +1,80 @@ #!/usr/bin/env nextflow - 
params.design="$baseDir/../test_data/samplesheet.csv" - params.bams = "$baseDir/../test_data/*.bam" - params.peaks = "$baseDir/../test_data/*.broadPeak" - params.genomepath="/project/shared/bicf_workflow_ref/GRCh37" - toppeakcount = -1 - design_file = file(params.design) - deeptools_design = Channel.fromPath(params.design) - diffbind_design = Channel.fromPath(params.design) - chipseeker_design = Channel.fromPath(params.design) - meme_design = Channel.fromPath(params.design) - index_bams = Channel.fromPath(params.bams) - deeptools_bams = Channel.fromPath(params.bams) - deeptools_peaks = Channel.fromPath(params.peaks) - chipseeker_peaks = Channel.fromPath(params.peaks) - diffbind_bams = Channel.fromPath(params.bams) - diffbind_peaks = Channel.fromPath(params.peaks) - meme_peaks = Channel.fromPath(params.peaks) - -process bamindex { - publishDir "$baseDir/output/", mode: 'copy' - input: - file index_bam_files from index_bams - output: - file "*bai" into deeptools_bamindex - file "*bai" into diffbind_bamindex - - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - module load samtools/intel/1.3 - samtools index $index_bam_files - """ -} -process run_deeptools { - publishDir "$baseDir/output", mode: 'copy' - input: - file deeptools_design_file from deeptools_design - file deeptools_bam_files from deeptools_bams.toList() - file deeptools_peak_files from deeptools_peaks.toList() - file deeptools_bam_indexes from deeptools_bamindex.toList() - output: - file "*deeptools*" into deeptools_output - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - module load deeptools/2.3.5 - python $baseDir/scripts/runDeepTools.py -i ${params.design} -g ${params.genomepath}} -""" -} +// Path to an input file, or a pattern for multiple inputs +// Note - $baseDir is the location of this workflow file main.nf +// Define Input variables +params.reads = "$baseDir/../test_data/*.fastq.gz" +params.pairedEnd = false +params.designFile = 
"$baseDir/../test_data/design_ENCSR238SGC_SE.txt" -process run_diffbind { - publishDir "$baseDir/output", mode: 'copy' - input: - file diffbind_design_file from diffbind_design - file diffbind_bam_files from diffbind_bams.toList() - file diffbind_peak_files from diffbind_peaks.toList() - file diffbind_bam_indexes from diffbind_bamindex.toList() - output: - file "diffpeak.design" into diffpeaksdesign_chipseeker - file "diffpeak.design" into diffpeaksdesign_meme - file "*_diffbind.bed" into diffpeaks_meme - file "*_diffbind.bed" into diffpeaks_chipseeker - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - Rscript $baseDir/scripts/runDiffBind.R $diffbind_design_file -""" -} +// Define List of Files +readsList = Channel + .fromPath( params.reads ) + .flatten() + .map { file -> [ file.getFileName().toString(), file.toString() ].join("\t")} + .collectFile( name: 'fileList.tsv', newLine: true ) -process run_chipseeker_diffpeak { - publishDir "$baseDir/output", mode: 'copy' - input: - file diffpeak_design_file from diffpeaksdesign_chipseeker - file diffpeaks from diffpeaks_chipseeker - output: - file "*chipseeker*" into chipseeker_diffpeak_output - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - Rscript $baseDir/scripts/runChipseeker.R $diffpeak_design_file ${params.genomepath} -""" -} +// Define regular variables +pairedEnd = params.pairedEnd +designFile = params.designFile -process run_chipseeker_originalpeak { - publishDir "$baseDir/output", mode: 'copy' - input: - file design_file from chipseeker_design - file chipseeker_peak_files from chipseeker_peaks.toList() - output: - file "*chipseeker*" into chipseeker_originalpeak_output - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - Rscript $baseDir/scripts/runChipseeker.R $design_file ${params.genomepath} -""" -} +process checkDesignFile { + + publishDir "$baseDir/output/design", mode: 'copy' + + input: + + designFile + file 
readsList + + output: + + file("design.tsv") into designFilePaths + + script: + + if (pairedEnd) { + """ + python $baseDir/scripts/check_design.py -d $designFile -f $readsList -p + """ + } + else { + """ + python $baseDir/scripts/check_design.py -d $designFile -f $readsList + """ + } -process run_meme_original { - publishDir "$baseDir/output", mode: 'copy' - input: - file design_meme from meme_design - file meme_peak_files from meme_peaks.toList() - output: - file "*meme*" into meme_original_output - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - module add deeptools/2.3.5 - module load meme/4.11.1-gcc-openmpi - python $baseDir/scripts/runMemechip.py -i $design_meme -g ${params.genomepath} -l ${toppeakcount} -""" } -process run_meme_diffpeak { - publishDir "$baseDir/output", mode: 'copy' - input: - file peaks_meme from diffpeaks_meme - file diffpeak_design from diffpeaksdesign_meme - output: - file "*meme*" into meme_diffpeak_output - script: - """ - module load python/2.7.x-anaconda - module load R/3.3.2-gccmkl - module add deeptools/2.3.5 - module load meme/4.11.1-gcc-openmpi - python $baseDir/scripts/runMemechip.py -i $diffpeak_design -g ${params.genomepath} -l ${toppeakcount} -""" +// Define channel for raw reads +if (pairedEnd) { + rawReads = designFilePaths + .splitCsv(sep: '\t', header: true) + .map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read2], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] } +} else { +rawReads = designFilePaths + .splitCsv(sep: '\t', header: true) + .map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read1], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] } } +process fastQc { + + tag "$sampleId-$replicate" + publishDir "$baseDir/output/", mode: 'copy', + saveAs: {filename -> filename.indexOf(".zip") > 0 ? 
"zips/$filename" : "$filename"} + + input: + + set sampleId, reads, biosample, factor, treatment, replicate, controlId from rawReads + + output: + + file '*_fastqc.{zip,html}' into fastqc_results + + script: + + """ + python $baseDir/scripts/qc_fastq.py -f $reads + """ +} diff --git a/workflow/scripts/__init__.py b/workflow/scripts/__init__.py deleted file mode 100644 index 8b137891791fe96927ad78e64b0aad7bded08bdc..0000000000000000000000000000000000000000 --- a/workflow/scripts/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/workflow/scripts/check_design.py b/workflow/scripts/check_design.py new file mode 100644 index 0000000000000000000000000000000000000000..929a97dc3b12da8ddd254671bdb62ff4a194c063 --- /dev/null +++ b/workflow/scripts/check_design.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 + +'''Check if design file is correctly formatted and matches files list.''' + +import argparse +import logging +import pandas as pd + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +## SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +def get_args(): + '''Define arguments.''' + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-d', '--design', + help="The design file to run QC (TSV format).", + required=True) + + parser.add_argument('-f', '--fastq', + help="File with list of fastq files (csv format).", + required=True) + + parser.add_argument('-p', '--paired', + help="True/False if paired-end or single end.", + default=False, + action='store_true') + + args = parser.parse_args() + return args + + +def check_design_headers(design, paired): + '''Check if design file conforms to sequencing type.''' + + # Default headers + design_template = [ + 'sample_id', + 'biosample', + 'factor', + 'treatment', + 'replicate', + 'control_id', + 'fastq_read1'] + + 
design_headers = list(design.columns.values) + + if paired: # paired-end data + design_template.extend(['fastq_read2']) + + # Check if headers + logger.info("Running header check.") + + missing_headers = set(design_template) - set(design_headers) + + if len(missing_headers) > 0: + logger.error('Missing column headers: %s', list(missing_headers)) + raise Exception("Missing column headers: %s" % list(missing_headers)) + + +def check_controls(design): + '''Check if design file has the correct control mapping.''' + + logger.info("Running control check.") + + missing_controls = set(design['control_id']) - set(design['sample_id']) + + if len(missing_controls) > 0: + logger.error('Missing control experiments: %s', list(missing_controls)) + raise Exception("Missing control experiments: %s" % list(missing_controls)) + + +def check_files(design, fastq, paired): + '''Check if design file has the files found.''' + + logger.info("Running file check.") + + if paired: # paired-end data + files = list(design['fastq_read1']) + list(design['fastq_read2']) + else: # single-end data + files = design['fastq_read1'] + + files_found = fastq['name'] + + missing_files = set(files) - set(files_found) + + if len(missing_files) > 0: + logger.error('Missing files from design file: %s', list(missing_files)) + raise Exception("Missing files from design file: %s" % list(missing_files)) + else: + file_dict = fastq.set_index('name').T.to_dict() + + design['fastq_read1'] = design['fastq_read1'] \ + .apply(lambda x: file_dict[x]['path']) + if paired: # paired-end data + design['fastq_read2'] = design['fastq_read2'] \ + .apply(lambda x: file_dict[x]['path']) + return design + + +def main(): + args = get_args() + + # Create a file handler + handler = logging.FileHandler('design.log') + logger.addHandler(handler) + + # Read files + design_file = pd.read_csv(args.design, sep='\t') + fastq_file = pd.read_csv(args.fastq, sep='\t', names=['name', 'path']) + + # Check design file + 
check_design_headers(design_file, args.paired) + check_controls(design_file) + new_design = check_files(design_file, fastq_file, args.paired) + + # Write out new design file + new_design.to_csv('design.tsv', header=True, sep='\t', index=False) + + +if __name__ == '__main__': + main() diff --git a/workflow/scripts/qc_fastq.py b/workflow/scripts/qc_fastq.py new file mode 100755 index 0000000000000000000000000000000000000000..95d817270d95a8bec029551ec4fa15e1f5031ff6 --- /dev/null +++ b/workflow/scripts/qc_fastq.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 + +'''QC check of raw .fastq files using FASTQC.''' + +import os +import subprocess +import argparse +import shutil +import logging +import sys +import json + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +## SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +def get_args(): + '''Define arguments.''' + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-f', '--fastq', + help="The fastq file to run QC check on.", + nargs='+', + required=True) + + args = parser.parse_args() + return args + + +def check_tools(): + '''Checks for required components on user system''' + + logger.info('Checking for required libraries and components on this system') + + fastqc_path = shutil.which("fastqc") + if fastqc_path: + logger.info('Found fastqc: %s', fastqc_path) + else: + logger.error('Missing fastqc') + raise Exception('Missing fastqc') + + +def check_qual_fastq(fastq): + '''Run fastqc on 1 or 2 files.''' + qc_command = "fastqc -f fastq " + " ".join(fastq) + + logger.info("Running fastqc with %s", qc_command) + + qual_fastq = subprocess.Popen(qc_command, shell=True) + out, err = qual_fastq.communicate() + + +def main(): + args = get_args() + + # Create a file handler + handler = logging.FileHandler('qc.log') + 
logger.addHandler(handler) + + # Check if tools are present + check_tools() + + # Run quality checks + check_qual_fastq(args.fastq) + + +if __name__ == '__main__': + main() diff --git a/workflow/scripts/runDeepTools.pyc b/workflow/scripts/runDeepTools.pyc deleted file mode 100644 index 90a7657d67359ee5b228fa1b5f01d1661b8421ca..0000000000000000000000000000000000000000 Binary files a/workflow/scripts/runDeepTools.pyc and /dev/null differ diff --git a/workflow/scripts/runMemechip.pyc b/workflow/scripts/runMemechip.pyc deleted file mode 100644 index d397028f858912677cfad02e2de1d34f683a954a..0000000000000000000000000000000000000000 Binary files a/workflow/scripts/runMemechip.pyc and /dev/null differ diff --git a/workflow/tests/test_check_design.py b/workflow/tests/test_check_design.py new file mode 100644 index 0000000000000000000000000000000000000000..394d5251b382b682bf758debdedd12aaf5637551 --- /dev/null +++ b/workflow/tests/test_check_design.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 + +import os +import pytest +import pandas as pd +from io import StringIO +import check_design +import sys + + +DESIGN_STRING = """sample_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\tfastq_read1 +A_1\tLiver\tH3K27ac\tNone\t1\tB_1\tA_1.fastq.gz +A_2\tLiver\tH3K27ac\tNone\t2\tB_2\tA_2.fastq.gz +B_1\tLiver\tInput\tNone\t1\tB_1\tB_1.fastq.gz +B_2\tLiver\tInput\tNone\t2\tB_2\tB_2.fastq.gz +""" + +FASTQ_STRING = """ +A_1.fastq.gz\t/path/to/file/A_1.fastq.gz +A_2.fastq.gz\t/path/to/file/A_2.fastq.gz +B_1.fastq.gz\t/path/to/file/B_1.fastq.gz +B_2.fastq.gz\t/path/to/file/B_2.fastq.gz +""" + + +@pytest.fixture +def design(): + design_file = StringIO(DESIGN_STRING) + design_df = pd.read_csv(design_file, sep="\t") + return design_df + + +@pytest.fixture +def fastq_files(): + fastq_file = StringIO(FASTQ_STRING) + fastq_df = pd.read_csv(fastq_file, sep='\t', names=['name', 'path']) + return fastq_df + + +@pytest.fixture +def design_1(design): + design_df = design.drop('fastq_read1', 
axis=1) + return design_df + + +@pytest.fixture +def design_2(design): + # Drop Control B_1 + design_df = design.drop(design.index[2]) + return design_df + +@pytest.fixture +def design_3(design): + # Drop A_2 and B_2 and append as fastq_read2 + design_df = design.drop(design.index[[1,3]]) + design_df['fastq_read2'] = design.loc[[1,3],'fastq_read1'].values + return design_df + + +@pytest.fixture +def fastq_files_1(fastq_files): + # Drop B_2.fastq.gz + fastq_df = fastq_files.drop(fastq_files.index[3]) + return fastq_df + + +def test_check_headers_singleend(design_1): + paired = False + with pytest.raises(Exception) as excinfo: + check_design.check_design_headers(design_1, paired) + assert str(excinfo.value) == "Missing column headers: ['fastq_read1']" + + +def test_check_headers_pairedend(design): + paired = True + with pytest.raises(Exception) as excinfo: + check_design.check_design_headers(design, paired) + assert str(excinfo.value) == "Missing column headers: ['fastq_read2']" + + +def test_check_controls(design_2): + with pytest.raises(Exception) as excinfo: + check_design.check_controls(design_2) + assert str(excinfo.value) == "Missing control experiments: ['B_1']" + + +def test_check_files_missing_files(design, fastq_files_1): + paired = False + with pytest.raises(Exception) as excinfo: + new_design = check_design.check_files(design, fastq_files_1, paired) + assert str(excinfo.value) == "Missing files from design file: ['B_2.fastq.gz']" + + +def test_check_files_output_singleend(design, fastq_files): + paired = False + new_design = check_design.check_files(design, fastq_files, paired) + assert new_design.loc[0,'fastq_read1'] == "/path/to/file/A_1.fastq.gz" + + +def test_check_files_output_pairedend(design_3, fastq_files): + paired = True + new_design = check_design.check_files(design_3, fastq_files, paired) + assert new_design.loc[0,'fastq_read2'] == "/path/to/file/A_2.fastq.gz"