Add in test data.

624918a8 · Venkat Malladi · 722a8dce · 624918a8 · 624918a8 · 624918a8
Commit 624918a8 authored 6 years ago by Venkat Malladi
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
+# Commands listed here run before the script of each job below.
+before_script:
+  - module add  python/3.6.1-2-anaconda
+  - pip install --user pytest-pythonpath pytest-cov
+  - module load nextflow/0.27.6
+  # Symlink the shared ATAC-seq FASTQ files into test_data/.
+  - ln -s /project/shared/bicf_workflow_ref/workflow_testdata/atacseq/*fastq.gz test_data/
+
+stages:
+  - unit
+  - integration
+
+# Runs only the tests carrying the pytest "unit" marker.
+user_configuration:
+  stage: unit
+  script:
+  - pytest -m unit
+
+# Runs the workflow without any command-line parameter overrides.
+# NOTE(review): test_data/fetch_test_data.sh labels the mouse data set as
+# paired-end and the human data set as single-end - confirm the job names.
+single_end_mouse:
+  stage: integration
+  script:
+  - nextflow run workflow/main.nf
+  artifacts:
+    expire_in: 2 days
+
+# Runs the workflow on the ENCSR729LGA paired-end design against GRCh38.
+paired_end_human:
+  stage: integration
+  script:
+  - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_ENCSR729LGA_PE.txt" --genome 'GRCh38' --pairedEnd true
+  artifacts:
+    expire_in: 2 days
--- a/test_data/fetch_test_data.sh
+++ b/test_data/fetch_test_data.sh
-echo "Downloading Single-end data set Mouse ENCSR238SGC and ENCSR687ALB"
-wget https://www.encodeproject.org/files/ENCFF833BLU/@@download/ENCFF833BLU.fastq.gz
-wget https://www.encodeproject.org/files/ENCFF646LXU/@@download/ENCFF646LXU.fastq.gz
-wget https://www.encodeproject.org/files/ENCFF524CAC/@@download/ENCFF524CAC.fastq.gz
-wget https://www.encodeproject.org/files/ENCFF163AJI/@@download/ENCFF163AJI.fastq.gz
-echo "Done with Single-end"
-
-echo "Downloading Paired-end data set Human ENCSR729LGA and ENCSR217LRF"
-wget https://www.encodeproject.org/files/ENCFF957SQS/@@download/ENCFF957SQS.fastq.gz
-wget https://www.encodeproject.org/files/ENCFF582IOZ/@@download/ENCFF582IOZ.fastq.gz
-wget https://www.encodeproject.org/files/ENCFF330MCZ/@@download/ENCFF330MCZ.fastq.gz
-wget https://www.encodeproject.org/files/ENCFF293YFE/@@download/ENCFF293YFE.fastq.gz
-wget https://www.encodeproject.org/files/ENCFF002DTU/@@download/ENCFF002DTU.fastq.gz
-wget https://www.encodeproject.org/files/ENCFF002EFI/@@download/ENCFF002EFI.fastq.gz
-wget https://www.encodeproject.org/files/ENCFF002EFG/@@download/ENCFF002EFG.fastq.gz
-wget https://www.encodeproject.org/files/ENCFF002DTS/@@download/ENCFF002DTS.fastq.gz
+# Each URL uses the same file accession in its path and in the file name it
+# requests; the previous path accessions were left over from the old file list.
+echo "Downloading Paired-end data set Mouse ENCSR451NAE"
+wget https://www.encodeproject.org/files/ENCFF655OFT/@@download/ENCFF655OFT.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF999SZR/@@download/ENCFF999SZR.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF913PMS/@@download/ENCFF913PMS.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF483MKX/@@download/ENCFF483MKX.fastq.gz
 echo "Done with Paired-end"
+
+echo "Downloading Single-end data set Human ENCSR265ZXX "
+wget https://www.encodeproject.org/files/ENCFF115PAE/@@download/ENCFF115PAE.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF610JYD/@@download/ENCFF610JYD.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF124LBK/@@download/ENCFF124LBK.fastq.gz
+echo "Done with Single-end"
--- a/workflow/tests/__pycache__/test_call_peaks_macs.cpython-36-PYTEST.pyc
+++ b/workflow/tests/__pycache__/test_call_peaks_macs.cpython-36-PYTEST.pyc
--- a/workflow/tests/__pycache__/test_check_design.cpython-36-PYTEST.pyc
+++ b/workflow/tests/__pycache__/test_check_design.cpython-36-PYTEST.pyc
--- a/workflow/tests/__pycache__/test_convert_reads.cpython-36-PYTEST.pyc
+++ b/workflow/tests/__pycache__/test_convert_reads.cpython-36-PYTEST.pyc
--- a/workflow/tests/__pycache__/test_experiment_design.cpython-36-PYTEST.pyc
+++ b/workflow/tests/__pycache__/test_experiment_design.cpython-36-PYTEST.pyc
--- a/workflow/tests/__pycache__/test_experiment_qc.cpython-36-PYTEST.pyc
+++ b/workflow/tests/__pycache__/test_experiment_qc.cpython-36-PYTEST.pyc
--- a/workflow/tests/__pycache__/test_map_qc.cpython-36-PYTEST.pyc
+++ b/workflow/tests/__pycache__/test_map_qc.cpython-36-PYTEST.pyc
--- a/workflow/tests/__pycache__/test_map_reads.cpython-36-PYTEST.pyc
+++ b/workflow/tests/__pycache__/test_map_reads.cpython-36-PYTEST.pyc
--- a/workflow/tests/__pycache__/test_overlap_peaks.cpython-36-PYTEST.pyc
+++ b/workflow/tests/__pycache__/test_overlap_peaks.cpython-36-PYTEST.pyc
--- a/workflow/tests/__pycache__/test_pool_and_psuedoreplicate.cpython-36-PYTEST.pyc
+++ b/workflow/tests/__pycache__/test_pool_and_psuedoreplicate.cpython-36-PYTEST.pyc
--- a/workflow/tests/__pycache__/test_trim_reads.cpython-36-PYTEST.pyc
+++ b/workflow/tests/__pycache__/test_trim_reads.cpython-36-PYTEST.pyc
--- a/workflow/tests/__pycache__/test_utils.cpython-36-PYTEST.pyc
+++ b/workflow/tests/__pycache__/test_utils.cpython-36-PYTEST.pyc
--- a/workflow/tests/__pycache__/test_xcor.cpython-36-PYTEST.pyc
+++ b/workflow/tests/__pycache__/test_xcor.cpython-36-PYTEST.pyc
--- a/workflow/tests/test_call_peaks_macs.py
+++ b/workflow/tests/test_call_peaks_macs.py
+#!/usr/bin/env python3
+
+import pytest
+import os
+from python_utils import utils
+
+# callPeaksMACS output directory, relative to this test file (trailing slash kept).
+test_output_path = '{}/../output/callPeaksMACS/'.format(
+    os.path.dirname(os.path.abspath(__file__)))
+
+
+@pytest.mark.integration
+def test_call_peaks_macs_singleend():
+    """Placeholder for the single-end callPeaksMACS output checks.
+
+    Every assertion below is commented out, so this test currently passes
+    without checking anything.
+    """
+    #assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT.fc_signal.bw'))
+    #assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT.pvalue_signal.bw'))
+    #peak_file = test_output_path + 'ENCLB144FDT_peaks.narrowPeak'
+    #assert utils.count_lines(peak_file) == 210349
+    pass
+
+@pytest.mark.integration
+def test_call_peaks_macs_pairedend():
+    """Placeholder for the paired-end callPeaksMACS checks; asserts nothing yet."""
+    # Do the same thing for paired end data
+    pass
--- a/workflow/tests/test_check_design.py
+++ b/workflow/tests/test_check_design.py
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+from design_file import check_design
+
+
+# Tab-separated design table: experiments A and B with two replicates each.
+# Every field, including the "None" factor, is preceded by a \t escape so
+# that each row has one value per header column.
+DESIGN_STRING = """sample_id\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tfastq_read1
+A_1\tA\tLiver\tNone\tNone\t1\tA_1.fastq.gz
+A_2\tA\tLiver\tNone\tNone\t2\tA_2.fastq.gz
+B_1\tB\tLiver\tNone\tNone\t1\tB_1.fastq.gz
+B_2\tB\tLiver\tNone\tNone\t2\tB_2.fastq.gz
+"""
+
+# Two tab-separated columns with no header row; the fastq_files fixture
+# reads them under the column names 'name' and 'path'.
+FASTQ_STRING = """
+A_1.fastq.gz\t/path/to/file/A_1.fastq.gz
+A_2.fastq.gz\t/path/to/file/A_2.fastq.gz
+B_1.fastq.gz\t/path/to/file/B_1.fastq.gz
+B_2.fastq.gz\t/path/to/file/B_2.fastq.gz
+"""
+
+
+@pytest.fixture
+def design():
+    """Return DESIGN_STRING parsed as a tab-separated DataFrame."""
+    return pd.read_csv(StringIO(DESIGN_STRING), sep="\t")
+
+
+@pytest.fixture
+def fastq_files():
+    """Return FASTQ_STRING parsed into a DataFrame with 'name' and 'path' columns."""
+    column_names = ['name', 'path']
+    return pd.read_csv(StringIO(FASTQ_STRING), sep='\t', names=column_names)
+
+
+@pytest.fixture
+def design_1(design):
+    """Return a copy of the design without its 'fastq_read1' column."""
+    return design.drop(['fastq_read1'], axis='columns')
+
+
+@pytest.fixture
+def design_2(design):
+    """Return a copy of the design without its third row (sample B_1)."""
+    b_1_label = design.index[2]
+    return design.drop(b_1_label)
+
+
+@pytest.fixture
+def design_3(design):
+    """Return a paired-end style design built from the single-end one.
+
+    Rows A_2 and B_2 are removed and their 'fastq_read1' values become the
+    'fastq_read2' column of the remaining rows A_1 and B_1.
+    """
+    second_rows = [1, 3]
+    second_reads = design.loc[second_rows, 'fastq_read1'].values
+    paired_df = design.drop(design.index[second_rows])
+    paired_df['fastq_read2'] = second_reads
+    return paired_df
+
+
+@pytest.fixture
+def design_4(design):
+    """Return the design with sample B_2's replicate set to 1.
+
+    The incoming design frame is modified in place and returned.
+    """
+    is_b_2 = design['sample_id'] == 'B_2'
+    design.loc[is_b_2, 'replicate'] = 1
+    return design
+
+
+@pytest.fixture
+def fastq_files_1(fastq_files):
+    """Return a copy of the FASTQ table without its fourth row (B_2.fastq.gz)."""
+    b_2_label = fastq_files.index[3]
+    return fastq_files.drop(b_2_label)
+
+
+@pytest.mark.unit
+def test_check_headers_singleend(design_1):
+    """Check the error raised for a single-end design lacking 'fastq_read1'."""
+    expected_message = "Missing column headers: ['fastq_read1']"
+    with pytest.raises(Exception) as raised:
+        check_design.check_design_headers(design_1, False)
+    assert str(raised.value) == expected_message
+
+
+@pytest.mark.unit
+def test_check_headers_pairedend(design):
+    """Check the error raised for a paired-end design lacking 'fastq_read2'."""
+    expected_message = "Missing column headers: ['fastq_read2']"
+    with pytest.raises(Exception) as raised:
+        check_design.check_design_headers(design, True)
+    assert str(raised.value) == expected_message
+
+
+@pytest.mark.unit
+def test_check_files_missing_files(design, fastq_files_1):
+    """Check the error raised when a design FASTQ is absent from the file list.
+
+    fastq_files_1 lacks B_2.fastq.gz, which the design still references.
+    """
+    paired = False
+    with pytest.raises(Exception) as excinfo:
+        # The call is expected to raise, so its return value is not bound.
+        check_design.check_files(design, fastq_files_1, paired)
+    assert str(excinfo.value) == "Missing files from design file: ['B_2.fastq.gz']"
+
+
+@pytest.mark.unit
+def test_check_files_output_singleend(design, fastq_files):
+    """Check that row 0 of 'fastq_read1' holds the full path after check_files."""
+    resolved = check_design.check_files(design, fastq_files, False)
+    first_read1 = resolved.loc[0, 'fastq_read1']
+    assert first_read1 == "/path/to/file/A_1.fastq.gz"
+
+
+@pytest.mark.unit
+def test_check_files_output_pairedend(design_3, fastq_files):
+    """Check that row 0 of 'fastq_read2' holds the full path after check_files."""
+    resolved = check_design.check_files(design_3, fastq_files, True)
+    first_read2 = resolved.loc[0, 'fastq_read2']
+    assert first_read2 == "/path/to/file/A_2.fastq.gz"
+
+
+@pytest.mark.unit
+def test_check_replicates(design_4):
+    """Check the error raised when an experiment repeats a replicate number.
+
+    design_4 gives experiment B two samples with replicate 1.
+    """
+    with pytest.raises(Exception) as excinfo:
+        # The call is expected to raise, so its return value is not bound.
+        check_design.check_replicates(design_4)
+    assert str(excinfo.value) == "Duplicate replicates in experiments: ['B']"
--- a/workflow/tests/test_convert_reads.py
+++ b/workflow/tests/test_convert_reads.py
+#!/usr/bin/env python3
+
+import pytest
+import os
+
+# convertReads output directory, relative to this test file (trailing slash kept).
+test_output_path = '{}/../output/convertReads/'.format(
+    os.path.dirname(os.path.abspath(__file__)))
+
+
+@pytest.mark.integration
+def test_convert_reads_singleend():
+    """Check that the tagAlign and BEDSE outputs for ENCFF115PAE exist."""
+    expected_outputs = (
+        'ENCFF115PAE.filt.nodup.tagAlign.gz',
+        'ENCFF115PAE.filt.nodup.bedse.gz',
+    )
+    for output_name in expected_outputs:
+        assert os.path.exists(os.path.join(test_output_path, output_name))
+
+
+@pytest.mark.integration
+def test_map_qc_pairedend():
+    """Placeholder for the paired-end convertReads checks; asserts nothing yet.
+
+    NOTE(review): the name says map_qc although this file tests convertReads;
+    it looks copied from test_map_qc.py - confirm and rename.
+    """
+    # Do the same thing for paired end data
+    # Also check that bedpe exists
+    pass
--- a/workflow/tests/test_experiment_design.py
+++ b/workflow/tests/test_experiment_design.py
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+from design_file import experiment_design
+import os
+
+# design output directory, relative to this test file (trailing slash kept).
+test_output_path = '{}/../output/design/'.format(
+    os.path.dirname(os.path.abspath(__file__)))
+
+# Tab-separated design table with tagAlign file names for four samples.
+# Every field, including the "None" factor, is preceded by a \t escape so
+# that each row has one value per header column.
+DESIGN_STRING = """sample_id\ttag_align\txcor\tbiosample\tfactor\ttreatment\treplicate
+A_1\tA_1.tagAlign.gz\tA\tLiver\tNone\tNone\t1
+A_2\tA_2.tagAlign.gz\tA\tLiver\tNone\tNone\t2
+B_1\tB_1.tagAlign.gz\tB\tLiver\tNone\tNone\t1
+B_2\tB_2.tagAlign.gz\tB\tLiver\tNone\tNone\t2
+"""
+
+
+@pytest.fixture
+def design_tag():
+    """Return DESIGN_STRING parsed as a tab-separated DataFrame."""
+    return pd.read_csv(StringIO(DESIGN_STRING), sep="\t")
+
+
+
+@pytest.mark.integration
+def test_experiment_design_single_end():
+    """Check that ENCSR265ZXX.tsv exists and holds three data rows."""
+    tsv_path = os.path.join(test_output_path, 'ENCSR265ZXX.tsv')
+    assert os.path.exists(tsv_path)
+    row_count = len(pd.read_csv(tsv_path, sep="\t"))
+    assert row_count == 3
+
+
+@pytest.mark.integration
+def test_experiment_design_paired_end():
+    """Placeholder for the paired-end experiment design checks; asserts nothing yet."""
+    # Do the same thing for paired end data
+    pass
--- a/workflow/tests/test_experiment_qc.py
+++ b/workflow/tests/test_experiment_qc.py
+#!/usr/bin/env python3
+
+import pytest
+import os
+import pandas as pd
+from io import StringIO
+from quality_metrics import experiment_qc
+
+# experimentQC output directory, relative to this test file (trailing slash kept).
+test_output_path = '{}/../output/experimentQC/'.format(
+    os.path.dirname(os.path.abspath(__file__)))
+
+# Tab-separated design table with BAM file names for four samples.
+# Every field, including the "None" factor, is preceded by a \t escape so
+# that each row has one value per header column.
+DESIGN_STRING = """sample_id\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tbam_reads
+A_1\tA\tLiver\tNone\tNone\t1\tA_1.bam
+A_2\tA\tLiver\tNone\tNone\t2\tA_2.bam
+B_1\tB\tLiver\tNone\tNone\t1\tB_1.bam
+B_2\tB\tLiver\tNone\tNone\t2\tB_2.bam
+"""
+
+
+@pytest.fixture
+def design_bam():
+    """Return DESIGN_STRING parsed as a tab-separated DataFrame."""
+    return pd.read_csv(StringIO(DESIGN_STRING), sep="\t")
+
+
+@pytest.mark.integration
+def test_experiment_qc_singleend():
+    """Check that the three experimentQC output files exist."""
+    expected_outputs = (
+        'sample_mbs.npz',
+        'heatmap_SpearmanCorr.png',
+        'coverage.png',
+    )
+    for output_name in expected_outputs:
+        assert os.path.exists(os.path.join(test_output_path, output_name))
+
+
+@pytest.mark.integration
+def test_experiment_qc_pairedend():
+    """Placeholder for the paired-end experimentQC checks; asserts nothing yet."""
+    # Do the same thing for paired end data
+    pass
--- a/workflow/tests/test_map_qc.py
+++ b/workflow/tests/test_map_qc.py
+#!/usr/bin/env python3
+
+import pytest
+import os
+import pandas as pd
+
+# filterReads output directory, relative to this test file (trailing slash kept).
+test_output_path = '{}/../output/filterReads/'.format(
+    os.path.dirname(os.path.abspath(__file__)))
+
+
+@pytest.mark.integration
+def test_map_qc_singleend():
+    """Placeholder for the single-end filterReads output checks.
+
+    Every assertion below is commented out, so this test currently passes
+    without checking anything.
+    """
+    #assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam'))
+    #assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam.bai'))
+    #filtered_reads_report = test_output_path + 'ENCFF646LXU.filt.nodup.flagstat.qc'
+    #samtools_report = open(filtered_reads_report).readlines()
+    #assert '64962570 + 0 in total' in samtools_report[0]
+    #assert '64962570 + 0 mapped (100.00%:N/A)' in samtools_report[4]
+    #library_complexity = test_output_path + 'ENCFF646LXU.filt.nodup.pbc.qc'
+    #df_library_complexity = pd.read_csv(library_complexity, sep='\t')
+    #assert  df_library_complexity["NRF"].iloc[0] == 0.926192
+    #assert  df_library_complexity["PBC1"].iloc[0] == 0.926775
+    #assert  df_library_complexity["PBC2"].iloc[0] == 13.706885
+    pass
+
+
+@pytest.mark.integration
+def test_map_qc_pairedend():
+    """Placeholder for the paired-end filterReads checks; asserts nothing yet."""
+    # Do the same thing for paired end data
+    pass