diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 82f2916c2fe6a9f1c60f4c8d06a9b41e93c2a0d8..da1209423786bce5c7cb745e043a97136226bcd0 100644 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -31,6 +31,10 @@ process { module = ['python/3.6.1-2-anaconda', 'phantompeakqualtools/1.2'] cpus = 32 } + $defineExpDesignFiles { + module = ['python/3.6.1-2-anaconda'] + executor = 'local' + } } params { diff --git a/workflow/main.nf b/workflow/main.nf index 0b8e0d5589210dfb15a34ad58238085b6456448f..e04236a1f4913c40aed881805666ba32b8d61e41 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -164,7 +164,7 @@ process filterReads { } -// Define channel collecting new design file +// Define channel collecting dedup reads intp new design file dedupDesign = dedupReads .map{ sampleId, bam, bai, biosample, factor, treatment, replicate, controlId -> "$sampleId\t$bam\t$bai\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"} @@ -232,7 +232,7 @@ process crossReads { output: - set sampleId, seTagAlign, tagAlign, file('*.cc.qc'), biosample, factor, treatment, replicate, controlId into xcorReads + set sampleId, tagAlign, file('*.cc.qc'), biosample, factor, treatment, replicate, controlId into xcorReads set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats script: @@ -249,3 +249,30 @@ process crossReads { } } + +// Define channel collecting tagAlign and xcor into design file +xcorDesign = xcorReads + .map{ sampleId, tagAlign, xcor, biosample, factor, treatment, replicate, controlId -> + "$sampleId\t$tagAlign\t$xcor\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"} + .collectFile(name:'design_xcor.tsv', seed:"sample_id\ttag_align\txcor\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design") + +// Make Experiment design files to be read in for downstream analysis +process defineExpDesignFiles { + + publishDir "$baseDir/output/design", mode: 'copy' + + input: + + file xcorDesign + + output: + + file '*.tsv' into experimentObjs + + script: + + """ + python3 $baseDir/scripts/experiment_design.py -d $xcorDesign + """ + +} diff --git a/workflow/scripts/experiment_design.py b/workflow/scripts/experiment_design.py new file mode 100644 index 0000000000000000000000000000000000000000..3a3be270fc11126e8c583ca61af6ae353a3c3303 --- /dev/null +++ b/workflow/scripts/experiment_design.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 + +'''Generate experiment design files for downstream processing.''' + +import argparse +import logging +import pandas as pd + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +def get_args(): + '''Define arguments.''' + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-d', '--design', + help="The design file to make experiemnts (tsv format).", + required=True) + + args = parser.parse_args() + return args + + +def update_controls(design): + '''Update design file to append controls list.''' + + logger.info("Running control file update.") + + file_dict = design[['sample_id', 'tag_align']] \ + .set_index('sample_id').T.to_dict() + + design['control_tag_align'] = design['control_id'] \ + .apply(lambda x: file_dict[x]['tag_align']) + + logger.info("Removing rows that are there own control.") + + design = design[design['control_id'] != design['sample_id']] + + return design + + +def make_experiment_design(design): + '''Make design file by grouping for each experiment''' + + logger.info("Running experiment design generation.") + + for experiment, df_experiment in design.groupby('experiment_id'): + experiment_file = experiment + '.tsv' + new_design.to_csv(experiment_file, header=True, sep='\t', index=False) + + +def main(): + args = get_args() + + # Create a file handler + handler = logging.FileHandler('experiment_generation.log') + logger.addHandler(handler) + + # Read files + design_file = pd.read_csv(args.design, sep='\t') + + # Update design file for check_controls + new_design = update_controls(design_file) + + # write out experiment design files + make_experiment_design(new_design) + + +if __name__ == '__main__': + main() diff --git a/workflow/tests/test_experiment_design.py b/workflow/tests/test_experiment_design.py new file mode 100644 index 0000000000000000000000000000000000000000..39ab570a409a004594656a47856bf5015759aaa3 --- /dev/null +++ b/workflow/tests/test_experiment_design.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 + +import pytest +import pandas as pd +from io import StringIO +import experiment_design + +test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../output/design/' + +DESIGN_STRING = """sample_id\ttag_align\txcor\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id +A_1\tA_1.tagAlign.gz\tA\tLiver\tH3K27ac\tNone\t1\tB_1 +A_2\tA_2.tagAlign.gz\tA\tLiver\tH3K27ac\tNone\t2\tB_2 +B_1\tB_1.tagAlign.gz\tB\tLiver\tInput\tNone\t1\tB_1 +B_2\tB_2.tagAlign.gz\tB\tLiver\tInput\tNone\t2\tB_2 +""" + + +@pytest.fixture +def design_tag(): + design_file = StringIO(DESIGN_STRING) + design_df = pd.read_csv(design_file, sep="\t") + return design_df + + +def test_check_update_controls_tag(design_tag): + new_design = experiment_design.update_controls(design_tag) + assert new_design.loc[0, 'control_tag_align'] == "B_1.tagAlign.gz" + + +def test_experiment_design_single_end(): + assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC.tsv')) + + +def test_experiment_design_paired_end(): + # Do the same thing for paired end data + pass