cf2262ba · e4c84105 · e4c84105 · e4c84105 · e4c84105 · e4c84105
--- a/workflow/scripts/plotProfile.sh
+++ b/workflow/scripts/plotProfile.sh
-#!/bin/bash
-#plotProfile.sh
-
-bws=$(ls *.bw)
-gtf=$(ls *.gtf *.bed)
-
-computeMatrix reference-point \
-	--referencePoint TSS \
-	-S $bws \
-	-R $gtf \
-	--skipZeros \
-	-o computeMatrix.gz
-
-plotProfile -m computeMatrix.gz \
-	-out plotProfile.png \
--- a/workflow/scripts/plot_profile.sh
+++ b/workflow/scripts/plot_profile.sh
+#!/bin/bash
+#plot_profile.sh
+
+script_name="plot_profile.sh"
+
+#Help function
+usage() {
+  echo "-h  --Help documentation for $script_name"
+  echo "-g  --File path to gtf/bed files"
+  echo "Example: $script_name -g 'genome.gtf'"
+  exit 1
+}
+
+
+raise()
+{
+  echo "${1}" >&2
+}
+
+check_tools() {
+  raise "
+   Checking for required libraries and components on this system
+   "
+   deeptools --version &> version_deeptools.txt
+   if [ $? -gt 0 ]
+     then
+      raise "Missing deeptools"
+      return 1
+    fi
+}
+
+compute_matrix() {
+  raise "
+  Computing matrix on ${1} using ${2}
+  "
+
+  computeMatrix reference-point \
+    --referencePoint TSS \
+    -S ${1} \
+    -R ${2} \
+    --skipZeros \
+    -o computeMatrix.gz \
+    -p max/2
+
+  if [ $? -gt 0 ]
+  then
+    raise "Problem building matrix"
+    return 1
+  fi
+}
+
+plot_profile() {
+  raise "
+  Plotting profile
+  "
+
+  plotProfile -m computeMatrix.gz \
+    -out plotProfile.png
+
+  if [ $? -gt 0 ]
+  then
+    raise "Problem plotting"
+    return 1
+  fi
+}
+
+
+run_main() {
+
+  # Parsing options
+  OPTIND=1 # Reset OPTIND
+  while getopts :g:h opt
+      do
+          case $opt in
+              g) gtf=$OPTARG;;
+              h) usage;;
+          esac
+      done
+
+  shift $(($OPTIND -1))
+
+  # Check for mandatory options
+  if [[ -z $gtf ]]; then
+      usage
+  fi
+
+  bws=$(ls *pooled.fc_signal.bw)
+
+  check_tools || exit 1
+
+  compute_matrix "${bws}" "${gtf}" || return 1
+
+  plot_profile || return 1
+
+  raise "ALL COMPLETE"
+}
+
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]
+then
+  run_main "$@"
+  if [ $? -gt 0 ]
+  then
+    exit 1
+  fi
+fi
--- a/workflow/scripts/pool_and_psuedoreplicate.py
+++ b/workflow/scripts/pool_and_psuedoreplicate.py
@@ -204,84 +204,43 @@ def generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_con
        pool_control_tmp = bedpe_to_tagalign(pool_control, "pool_control")
        pool_control = pool_control_tmp

-    if not replicated:

-        # Duplicate rows and update for pool and psuedoreplicates and update tagAlign with single end data
-        experiment_id = design_df.at[0, 'experiment_id']
-        replicate = design_df.at[0, 'replicate']
-        design_new_df = design_df.loc[np.repeat(design_df.index, 4)].reset_index()
-
-        # Update tagAlign with single end data
-        if paired:
-            design_new_df['tag_align'] = design_new_df['se_tag_align']
-        design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
-
-        design_new_df['replicate'] = design_new_df['replicate'].astype(str)
-        design_new_df.at[1, 'sample_id'] = experiment_id + '_pr1'
-        design_new_df.at[1, 'replicate'] = '1_pr'
-        design_new_df.at[1, 'xcor'] = 'Calculate'
-        design_new_df.at[2, 'sample_id'] = experiment_id + '_pr2'
-        design_new_df.at[2, 'replicate'] = '2_pr'
-        design_new_df.at[2, 'xcor'] = 'Calculate'
-        design_new_df.at[3, 'sample_id'] = experiment_id + '_pooled'
-        design_new_df.at[3, 'replicate'] = 'pooled'
-        design_new_df.at[3, 'xcor'] = 'Calculate'
-        design_new_df.at[3, 'tag_align'] = design_new_df.at[0, 'tag_align']
-
-        # Make 2 self psuedoreplicates
-        self_pseudoreplicates_dict = {}
-        for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']):
-            replicate_prefix = experiment_id + '_' + str(rep)
-            self_pseudoreplicates_dict = \
-                self_psuedoreplication(tag_file, replicate_prefix, paired)
-
-        # Update design to include new self pseudo replicates
-        for rep, pseudorep_file in self_pseudoreplicates_dict.items():
-            path_to_file = cwd + '/' + pseudorep_file
-            replicate = rep + 1
-            design_new_df.loc[replicate, 'tag_align'] = path_to_file
-
-        # Drop index column
-        design_new_df.drop(labels='index', axis=1, inplace=True)
+    # Duplicate rows and update for pool and psuedoreplicates and update tagAlign with single end data
+    experiment_id = design_df.at[0, 'experiment_id']
+    replicate_files = design_df.tag_align.unique()
+    pool_experiment = pool(replicate_files, experiment_id + "_pooled", paired)
+
+    # Make 2 self psuedoreplicates
+    pseudoreplicates_dict = {}
+    for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']):
+        replicate_prefix = experiment_id + '_' + str(rep)
+        pr_dict = self_psuedoreplication(tag_file, replicate_prefix, paired)
+        pseudoreplicates_dict[rep] = pr_dict
+
+    # Update design to include new self pseudo replicates
+    pseudoreplicates_df = pd.DataFrame.from_dict(pseudoreplicates_dict)
+    pool_pseudoreplicates_dict = {}
+    for index, row in pseudoreplicates_df.iterrows():
+        replicate_id = index + 1
+        pr_filename = experiment_id + ".pr" + str(replicate_id) + '.tagAlign.gz'
+        pool_replicate = pool(row, pr_filename, False)
+        pool_pseudoreplicates_dict[replicate_id] = pool_replicate
+
+    design_new_df = design_df #.loc[np.repeat(design_df.index, 4)].reset_index()
+    # Update tagAlign with single end data
+    if paired:
+        design_new_df['tag_align'] = design_new_df['se_tag_align']
+    design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)

+    # If paired change to single End
+    if paired:
+        pool_experiment_se = bedpe_to_tagalign(pool_experiment, experiment_id + "_pooled")
    else:
-        # Make pool of replicates
-        replicate_files = design_df.tag_align.unique()
-        experiment_id = design_df.at[0, 'experiment_id']
-        pool_experiment = pool(replicate_files, experiment_id + "_pooled", paired)
-
-        # If paired change to single End
-        if paired:
-            pool_experiment_se = bedpe_to_tagalign(pool_experiment, experiment_id + "_pooled")
-        else:
-            pool_experiment_se = pool_experiment
-
-        # Make self psuedoreplicates equivalent to number of replicates
-        pseudoreplicates_dict = {}
-        for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']):
-            replicate_prefix = experiment_id + '_' + str(rep)
-            pr_dict = self_psuedoreplication(tag_file, replicate_prefix, paired)
-            pseudoreplicates_dict[rep] = pr_dict
-
-        # Merge self psuedoreplication for each true replicate
-        pseudoreplicates_df = pd.DataFrame.from_dict(pseudoreplicates_dict)
-        pool_pseudoreplicates_dict = {}
-        for index, row in pseudoreplicates_df.iterrows():
-            replicate_id = index + 1
-            pr_filename = experiment_id + ".pr" + str(replicate_id) + '.tagAlign.gz'
-            pool_replicate = pool(row, pr_filename, False)
-            pool_pseudoreplicates_dict[replicate_id] = pool_replicate
-
-        design_new_df = design_df
-        # Update tagAlign with single end data
-        if paired:
-            design_new_df['tag_align'] = design_new_df['se_tag_align']
-        design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
-        # Check controls against cutoff_ratio
-        # if so replace with pool_control
-        # unless single control was used
-
+        pool_experiment_se = pool_experiment

+    # Check controls against cutoff_ratio
+    # if so replace with pool_control
+    # unless single control was used
    if not single_control:
        path_to_pool_control = cwd + '/' + pool_control
        if control_df.values.max() > cutoff_ratio:
@@ -309,28 +268,31 @@ def generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_con
                                                            path_to_control

    else:
-        path_to_pool_control = cwd + '/' +  pool_control
+        if paired:
+            path_to_pool_control = cwd + '/' + pool_control
+        else:
+            path_to_pool_control = pool_control
        design_new_df['control_tag_align'] = path_to_pool_control

-        # Add in pseudo replicates
-        tmp_metadata = design_new_df.loc[0].copy()
-        tmp_metadata['control_tag_align'] = path_to_pool_control
-        for rep, pseudorep_file in pool_pseudoreplicates_dict.items():
-            tmp_metadata['sample_id'] = experiment_id + '_pr' + str(rep)
-            tmp_metadata['replicate'] = str(rep) + '_pr'
-            tmp_metadata['xcor'] = 'Calculate'
-            path_to_file = cwd + '/' + pseudorep_file
-            tmp_metadata['tag_align'] = path_to_file
-            design_new_df = design_new_df.append(tmp_metadata)
-
-        # Add in pool experiment
-        tmp_metadata['sample_id'] = experiment_id + '_pooled'
-        tmp_metadata['replicate'] = 'pooled'
+    # Add in pseudo replicates
+    tmp_metadata = design_new_df.loc[0].copy()
+    tmp_metadata['control_tag_align'] = path_to_pool_control
+    for rep, pseudorep_file in pool_pseudoreplicates_dict.items():
+        tmp_metadata['sample_id'] = experiment_id + '_pr' + str(rep)
+        tmp_metadata['replicate'] = str(rep) + '_pr'
        tmp_metadata['xcor'] = 'Calculate'
-        path_to_file = cwd + '/' + pool_experiment_se
+        path_to_file = cwd + '/' + pseudorep_file
        tmp_metadata['tag_align'] = path_to_file
        design_new_df = design_new_df.append(tmp_metadata)

+    # Add in pool experiment
+    tmp_metadata['sample_id'] = experiment_id + '_pooled'
+    tmp_metadata['replicate'] = 'pooled'
+    tmp_metadata['xcor'] = 'Calculate'
+    path_to_file = cwd + '/' + pool_experiment_se
+    tmp_metadata['tag_align'] = path_to_file
+    design_new_df = design_new_df.append(tmp_metadata)
+
    return design_new_df


@@ -348,7 +310,7 @@ def main():
    design_df = pd.read_csv(design, sep='\t')

    # Get current directory to build paths
-    cwd = os.getcwd() 
+    cwd = os.getcwd()

    # Check Number of replicates and replicates
    no_reps = check_replicates(design_df)
@@ -358,6 +320,7 @@ def main():
    design_new_df = generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_controls)

    # Write out new dataframe
+    experiment_id = design_df.at[0, 'experiment_id']
    design_new_df.to_csv(experiment_id + '_ppr.tsv',
                         header=True, sep='\t', index=False)


--- a/workflow/scripts/xcor.py
+++ b/workflow/scripts/xcor.py
@@ -103,14 +103,20 @@ def xcor(tag, paired):
    uncompressed_tag_filename = tag_basename

    # Subsample tagAlign file
-    number_reads = 15000000
+    number_reads = 20000000
    subsampled_tag_filename = \
        tag_basename + ".%d.tagAlign.gz" % (number_reads/1000000)

+    tag_extended = 'cat.tagAlign.gz'
+    out, err = utils.run_pipe([
+        "zcat %s %s %s" %
+        (tag, tag, tag)
+    ], outfile=tag_extended)
+
    steps = [
        'zcat %s' % (tag),
        'grep -v "chrM"',
-        'shuf -n %d --random-source=%s' % (number_reads, tag)]
+        'shuf -n %d --random-source=%s' % (number_reads, tag_extended)]

    if paired:
        steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])

--- a/workflow/tests/plot_profile.bats
+++ b/workflow/tests/plot_profile.bats
+#!/opt/bats/libexec/bats-core/ bats
+
+profile_script="./workflow/scripts/plot_profile.sh"
+
+@test "Test deeptools present" {
+  source ${profile_script}
+  run check_tools
+}
+
+@test "Test deeptools computeMatrix" {
+  source ${profile_script}
+  run compute_matrix test_data/ENCSR238SGC_pooled.fc_signal.bw /project/shared/bicf_workflow_ref/mouse/GRCm38/gencode.vM20.annotation.gtf
+  FILE=computeMatrix.gz
+  if [[ -s "$FILE" ]]; then
+      echo "$FILE exists and not empty"
+  fi
+}
+
+@test "Test deeptools plotProfile" {
+  source ${profile_script}
+  run plot_profile computeMatrix.gz
+  FILE=plotProfile.png
+  if [[ -s "$FILE" ]]; then
+      echo "$FILE exists and not empty"
+  fi
+}
--- a/workflow/tests/test_annotate_peaks.py
+++ b/workflow/tests/test_annotate_peaks.py
@@ -25,6 +25,10 @@ def test_annotation_singleend():
    annotation_file = test_output_path + 'ENCSR238SGC.chipseeker_annotation.tsv'
    assert os.path.exists(annotation_file)
    assert utils.count_lines(annotation_file) >= 149284
+    df = pd.read_csv(annotation_file, sep = "\t", header = 0)
+    print(df.head())
+    #assert df['symbol'].notna().all()
+    assert not(df['symbol'].isnull().values.any())


 @pytest.mark.pairedend
@@ -41,4 +45,8 @@ def test_upsetplot_pairedend():
 def test_annotation_pairedend():
    annotation_file = test_output_path + 'ENCSR729LGA.chipseeker_annotation.tsv'
    assert os.path.exists(annotation_file)
-    assert utils.count_lines(annotation_file) >= 25494
+    assert utils.count_lines(annotation_file) >= 25367
+    df = pd.read_csv(annotation_file, sep = "\t", header = 0)
+    print(df.head())
+    #assert df['symbol'].notna().all()
+    assert not(df['symbol'].isnull().values.any()) 
--- a/workflow/tests/test_call_peaks_macs.py
+++ b/workflow/tests/test_call_peaks_macs.py
@@ -3,10 +3,29 @@
 import pytest
 import os
 import utils
+from io import StringIO
+import call_peaks_macs

 test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
                '/../output/callPeaksMACS/'

+test_data_path = os.path.dirname(os.path.abspath(__file__)) + \
+		'/../../test_data/'
+        
+
+@pytest.mark.unit
+def test_fragment_length():
+    experiment = "experiment.tagAlign.gz"
+    control = "control.tagAlign.gz"
+    prefix = 'test'
+    genome_size = 'hs'
+    chrom_sizes = 'genomefile.txt'
+    cross_qc = os.path.join(test_data_path, 'test_cross.qc')
+    with pytest.raises(Exception) as excinfo:
+        call_peaks_macs.call_peaks_macs(experiment, cross_qc, control, prefix, genome_size, chrom_sizes)
+    assert str(excinfo.value) == "Error in cross-correlation analysis: ['0', '20', '33']"
+
+

 @pytest.mark.singleend
 def test_fc_signal_singleend():
@@ -26,7 +45,7 @@ def test_peaks_xls_singleend():
 @pytest.mark.singleend
 def test_peaks_bed_singleend():
    peak_file = test_output_path +  'ENCSR238SGC/1/' + 'ENCLB144FDT.narrowPeak'
-    assert utils.count_lines(peak_file) == 227389
+    assert utils.count_lines(peak_file) == 226738


 @pytest.mark.pairedend
@@ -47,4 +66,4 @@ def test_peaks_xls_pairedend():
 @pytest.mark.pairedend
 def test_peaks_bed_pairedend():
    peak_file = test_output_path + 'ENCSR729LGA/2/' + 'ENCLB568IYX.narrowPeak'
-    assert utils.count_lines(peak_file) == 113821
+    assert utils.count_lines(peak_file) == 112631
--- a/workflow/tests/test_diff_peaks.py
+++ b/workflow/tests/test_diff_peaks.py
@@ -15,7 +15,7 @@ def test_diff_peaks_singleend_single_rep():
    assert os.path.isdir(test_output_path) == False


-@pytest.mark.pairedend
+@pytest.mark.pairedendskip_true
 def test_diff_peaks_pairedend_single_rep():
    assert os.path.isdir(test_output_path) == False

@@ -71,4 +71,4 @@ def test_diffbind_pairedend_single_rep():
        assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA_vs_ENCSR757EMK_diffbind.bed'))
        diffbind_file = test_output_path + 'ENCSR729LGA_vs_ENCSR757EMK_diffbind.csv'
    assert os.path.exists(diffbind_file)
-    assert utils.count_lines(diffbind_file) >= 66201
+    assert utils.count_lines(diffbind_file) >= 65124
--- a/workflow/tests/test_generate_software_references.py
+++ b/workflow/tests/test_generate_software_references.py
@@ -4,6 +4,7 @@ import pytest
 import os
 import utils
 import yaml
+from bs4 import BeautifulSoup

 test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
                '/../output/multiqcReport/'
@@ -21,3 +22,14 @@ def test_software_references_output():
        data_loaded = yaml.load(stream)

    assert len(data_loaded['data'].split('<ul>')) == 19
+
+
+@pytest.mark.singleend
+def test_software_references_html():
+    multiqc_report = os.path.join(test_output_path, 'multiqc_report.html')
+    html_file = open(multiqc_report, 'r')
+    source_code = html_file.read()
+    multiqc_html = BeautifulSoup(source_code, 'html.parser')
+    references = multiqc_html.find(id="mqc-module-section-Software_References")
+    assert references is not None
+    assert len(references.find_all('ul')) == 18
--- a/workflow/tests/test_generate_software_versions.py
+++ b/workflow/tests/test_generate_software_versions.py
@@ -20,4 +20,5 @@ def test_software_versions_output():
    with open(software_versions, 'r') as stream:
        data_loaded = yaml.load(stream)

-    assert  len(data_loaded['data'].split('<dt>')) == 17
+    assert  len(data_loaded['data'].split('<dt>')) == 18
+    assert  'Not Run' not in data_loaded['data'].split('<dt>')[17] 
--- a/workflow/tests/test_overlap_peaks.py
+++ b/workflow/tests/test_overlap_peaks.py
@@ -44,4 +44,10 @@ def test_overlap_peaks_singleend():
 def test_overlap_peaks_pairedend():
    assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA.rejected.narrowPeak'))
    peak_file = test_output_path + 'ENCSR729LGA.replicated.narrowPeak'
-    assert utils.count_lines(peak_file) >= 25758
+    assert utils.count_lines(peak_file) >= 25657
+
+@pytest.mark.singlecontrol
+def test_overlap_peaks_singlecontrol():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCSR000DXB.rejected.narrowPeak'))
+    peak_file = test_output_path + 'ENCSR000DXB.replicated.narrowPeak'
+    assert utils.count_lines(peak_file) >= 35097
--- a/workflow/tests/test_plot_profile.py
+++ b/workflow/tests/test_plot_profile.py
@@ -11,8 +11,9 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
 @pytest.mark.singleend
 def test_plot_singleend():
    assert os.path.exists(os.path.join(test_output_path, 'plotProfile.png'))
-
+    assert os.path.getsize(os.path.join(test_output_path, 'plotProfile.png')) > 0

 @pytest.mark.pairedend
 def test_plot_pairedend():
-    assert os.path.exists(os.path.join(test_output_path, 'computeMatrix.gz'))
+    assert os.path.exists(os.path.join(test_output_path, 'plotProfile.png'))
+    assert os.path.getsize(os.path.join(test_output_path, 'plotProfile.png')) > 0
--- a/workflow/tests/test_pool_and_psuedoreplicate.py
+++ b/workflow/tests/test_pool_and_psuedoreplicate.py
@@ -33,9 +33,12 @@ def design_experiment_2(design_experiment):

 @pytest.fixture
 def design_experiment_3(design_experiment):
-    # Update second control to be same as first
-    design_experiment.loc[1, 'control_tag_align'] = 'B_1.bedse.gz'
-    return design_experiment
+    # Drop Replicate A_2
+    design_df = design_experiment.drop(design_experiment.index[1])
+    # Update to be paired as first
+    design_df.loc[0, 'control_tag_align'] = 'B_1.bedpe.gz'
+    design_df.loc[0, 'tag_align'] = 'A_1.bedpe.gz'
+    return design_df


 @pytest.mark.unit
@@ -71,6 +74,19 @@ def test_single_rep(design_experiment_2):
    shutil.copy(test_design_path + 'B_1.tagAlign.gz', cwd)
    single_rep = pool_and_psuedoreplicate.generate_design('false', 1.2, design_experiment_2, cwd, 1, 1)
    assert single_rep.shape[0] == 4
+    assert len(single_rep['control_tag_align'].unique()) == 2
+    assert 'pool_control.tagAlign.gz' in single_rep['control_tag_align'].unique()[1]
+
+
+@pytest.mark.unit
+def test_single_control(design_experiment_3):
+    cwd = os.getcwd()
+    shutil.copy(test_design_path + 'A_1.bedpe.gz', cwd)
+    shutil.copy(test_design_path + 'B_1.bedpe.gz', cwd)
+    shutil.copy(test_design_path + 'A_1.tagAlign.gz', cwd)
+    single_control = pool_and_psuedoreplicate.generate_design('true', 1.2, design_experiment_3, cwd, 1, 1)
+    assert 'pool_control.tagAlign.gz' in single_control['control_tag_align'].unique()[0]
+

 @pytest.mark.singleend
 def test_pool_and_psuedoreplicate_singleend():

--- a/workflow/tests/test_xcor.py
+++ b/workflow/tests/test_xcor.py
@@ -17,9 +17,9 @@ def test_cross_plot_singleend():
 def test_cross_qc_singleend():
    qc_file = os.path.join(test_output_path,"ENCLB144FDT/ENCLB144FDT.cc.qc")
    df_xcor = pd.read_csv(qc_file, sep="\t", header=None)
-    assert df_xcor[2].iloc[0] == '190,200,210'
-    assert df_xcor[8].iloc[0] == 1.025906
-    assert round(df_xcor[9].iloc[0], 6) == 1.139671
+    assert df_xcor[2].iloc[0] == '185,195,205'
+    assert df_xcor[8].iloc[0] == 1.02454
+    assert df_xcor[9].iloc[0] == 0.8098014


 @pytest.mark.pairedend
@@ -31,6 +31,6 @@ def test_cross_qc_pairedend():
 def test_cross_plot_pairedend():
    qc_file = os.path.join(test_output_path,"ENCLB568IYX/ENCLB568IYX.cc.qc")
    df_xcor = pd.read_csv(qc_file, sep="\t", header=None)
-    assert df_xcor[2].iloc[0] == '220,430,475'
-    assert round(df_xcor[8].iloc[0],6) == 1.060018
-    assert df_xcor[9].iloc[0] == 4.099357
+    assert df_xcor[2].iloc[0] == '215,225,455'
+    assert round(df_xcor[8].iloc[0],6) == 1.056201
+    assert df_xcor[9].iloc[0] == 3.599357
No results found