diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index 82f2916c2fe6a9f1c60f4c8d06a9b41e93c2a0d8..74795c41b9035e8a9bf8241ac0fcd54cfcd1449c 100644
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -31,6 +31,14 @@ process {
     module = ['python/3.6.1-2-anaconda', 'phantompeakqualtools/1.2']
     cpus = 32
   }
+  $defineExpDesignFiles {
+    module = ['python/3.6.1-2-anaconda']
+    executor = 'local'
+  }
+  $poolAndPsuedoReads {
+    module = ['python/3.6.1-2-anaconda']
+    cpus = 32
+  }
 }

 params {
diff --git a/workflow/main.nf b/workflow/main.nf
index 0b8e0d5589210dfb15a34ad58238085b6456448f..aa758a8656dfddd010f782b57296458c5edabdf4 100644
--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -10,6 +10,7 @@ params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt"
 params.genome = 'GRCm38'
 params.genomes = []
 params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false
+params.cutoffRatio = 1.2

 // Check inputs
 if( params.bwaIndex ){
@@ -30,6 +31,7 @@ readsList = Channel
 // Define regular variables
 pairedEnd = params.pairedEnd
 designFile = params.designFile
+cutoffRatio = params.cutoffRatio

 process checkDesignFile {
@@ -63,11 +65,11 @@ process checkDesignFile {
 if (pairedEnd) {
   rawReads = designFilePaths
     .splitCsv(sep: '\t', header: true)
-    .map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read2], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
+    .map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read2], row.experiment_id, row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
 } else {
   rawReads = designFilePaths
     .splitCsv(sep: '\t', header: true)
-    .map { row -> [ row.sample_id, [row.fastq_read1], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
+    .map { row -> [ row.sample_id, [row.fastq_read1], row.experiment_id, row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
 }

 // Trim raw reads using trimgalore
@@ -78,11 +80,11 @@ process trimReads {

   input:

-  set sampleId, reads, biosample, factor, treatment, replicate, controlId from rawReads
+  set sampleId, reads, experimentId, biosample, factor, treatment, replicate, controlId from rawReads

   output:

-  set sampleId, file('*.fq.gz'), biosample, factor, treatment, replicate, controlId into trimmedReads
+  set sampleId, file('*.fq.gz'), experimentId, biosample, factor, treatment, replicate, controlId into trimmedReads
   file('*trimming_report.txt') into trimgalore_results

   script:
@@ -108,12 +110,12 @@ process alignReads {

   input:

-  set sampleId, reads, biosample, factor, treatment, replicate, controlId from trimmedReads
+  set sampleId, reads, experimentId, biosample, factor, treatment, replicate, controlId from trimmedReads
   file index from bwaIndex.first()

   output:

-  set sampleId, file('*.bam'), biosample, factor, treatment, replicate, controlId into mappedReads
+  set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into mappedReads
   file '*.srt.bam.flagstat.qc' into mappedReadsStats

   script:
@@ -139,12 +141,12 @@ process filterReads {

   input:

-  set sampleId, mapped, biosample, factor, treatment, replicate, controlId from mappedReads
+  set sampleId, mapped, experimentId, biosample, factor, treatment, replicate, controlId from mappedReads

   output:

-  set sampleId, file('*.bam'), file('*.bai'), biosample, factor, treatment, replicate, controlId into dedupReads
-  set sampleId, file('*.bam'), biosample, factor, treatment, replicate, controlId into convertReads
+  set sampleId, file('*.bam'), file('*.bai'), experimentId, biosample, factor, treatment, replicate, controlId into dedupReads
+  set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into convertReads
   file '*flagstat.qc' into dedupReadsStats
   file '*pbc.qc' into dedupReadsComplexity
   file '*dup.qc' into dupReads
@@ -164,11 +166,11 @@ process filterReads {

 }

-// Define channel collecting new design file
+// Define channel collecting dedup reads into new design file
 dedupDesign = dedupReads
-  .map{ sampleId, bam, bai, biosample, factor, treatment, replicate, controlId ->
-    "$sampleId\t$bam\t$bai\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
-  .collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design")
+  .map{ sampleId, bam, bai, experimentId, biosample, factor, treatment, replicate, controlId ->
+    "$sampleId\t$bam\t$bai\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
+  .collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design")

 // Quality Metrics using deeptools
 process experimentQC {
@@ -199,11 +201,11 @@ process convertReads {

   input:

-  set sampleId, deduped, biosample, factor, treatment, replicate, controlId from convertReads
+  set sampleId, deduped, experimentId, biosample, factor, treatment, replicate, controlId from convertReads

   output:

-  set sampleId, file('*.tagAlign.gz'), file('*.bed{pe,se}.gz'), biosample, factor, treatment, replicate, controlId into tagReads
+  set sampleId, file('*.tagAlign.gz'), file('*.bed{pe,se}.gz'), experimentId, biosample, factor, treatment, replicate, controlId into tagReads

   script:
@@ -228,11 +230,11 @@ process crossReads {

   input:

-  set sampleId, seTagAlign, tagAlign, biosample, factor, treatment, replicate, controlId from tagReads
+  set sampleId, seTagAlign, tagAlign, experimentId, biosample, factor, treatment, replicate, controlId from tagReads

   output:

-  set sampleId, seTagAlign, tagAlign, file('*.cc.qc'), biosample, factor, treatment, replicate, controlId into xcorReads
+  set sampleId, tagAlign, file('*.cc.qc'), experimentId, biosample, factor, treatment, replicate, controlId into xcorReads
   set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats

   script:
@@ -249,3 +251,59 @@ process crossReads {
   }

 }
+
+// Define channel collecting tagAlign and xcor into design file
+xcorDesign = xcorReads
+  .map{ sampleId, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId ->
+    "$sampleId\t$tagAlign\t$xcor\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
+  .collectFile(name:'design_xcor.tsv', seed:"sample_id\ttag_align\txcor\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design")
+
+// Make Experiment design files to be read in for downstream analysis
+process defineExpDesignFiles {
+
+  publishDir "$baseDir/output/design", mode: 'copy'
+
+  input:
+
+  file xcorDesign
+
+  output:
+
+  file '*.tsv' into experimentObjs
+
+  script:
+
+  """
+  python3 $baseDir/scripts/experiment_design.py -d $xcorDesign
+  """
+
+}
+
+
+// Pool reads and generate pseudoreplicates for each experiment
+process poolAndPsuedoReads {
+
+  publishDir "$baseDir/output/design", mode: 'copy'
+
+  input:
+
+  file experimentObjs
+
+  output:
+
+  file '*.tsv' into experimentPoolObjs
+
+  script:
+
+  if (pairedEnd) {
+    """
+    python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio -p
+    """
+  }
+  else {
+    """
+    python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio
+    """
+  }
+
+}
diff --git a/workflow/scripts/check_design.py b/workflow/scripts/check_design.py
index 161660e03979faee3561808b130f1e4ea0fa1c31..083151213fe483f7cbd4a3feb76c09d760b212d7 100644
--- a/workflow/scripts/check_design.py
+++ b/workflow/scripts/check_design.py
@@ -21,6 +21,7 @@ logger.setLevel(logging.INFO)

 def get_args():
     '''Define arguments.'''
+
     parser = argparse.ArgumentParser(
         description=__doc__, epilog=EPILOG,
         formatter_class=argparse.RawDescriptionHelpFormatter)
@@ -98,7 +99,7 @@ def check_replicates(design):
         unique_replicates = set(replicates)
         if len(replicates) != len(unique_replicates):
             duplicated_replicates.append(experiment)
-    
+
     if len(duplicated_replicates) > 0:
         logger.error('Duplicate replicates in experiments: %s', list(duplicated_replicates))
         raise Exception("Duplicate replicates in experiments: %s" %
@@ -136,23 +137,26 @@ def check_files(design, fastq, paired):

 def main():
     args = get_args()
+    design = args.design
+    fastq = args.fastq
+    paired = args.paired

     # Create a file handler
     handler = logging.FileHandler('design.log')
     logger.addHandler(handler)

-    # Read files
-    design_file = pd.read_csv(args.design, sep='\t')
-    fastq_file = pd.read_csv(args.fastq, sep='\t', names=['name', 'path'])
+    # Read files as dataframes
+    design_df = pd.read_csv(design, sep='\t')
+    fastq_df = pd.read_csv(fastq, sep='\t', names=['name', 'path'])

     # Check design file
-    check_design_headers(design_file, args.paired)
-    check_controls(design_file)
-    check_replicates(design_file)
-    new_design = check_files(design_file, fastq_file, args.paired)
+    check_design_headers(design_df, paired)
+    check_controls(design_df)
+    check_replicates(design_df)
+    new_design_df = check_files(design_df, fastq_df, paired)

     # Write out new design file
-    new_design.to_csv('design.tsv', header=True, sep='\t', index=False)
+    new_design_df.to_csv('design.tsv', header=True, sep='\t', index=False)


 if __name__ == '__main__':
diff --git a/workflow/scripts/convert_reads.py b/workflow/scripts/convert_reads.py
index d7450475c5be71023c365aa6dfecfd3ddb0ddc7e..62faf7258291ae63ff4eab0dd02ee7bb6ff1fa12 100644
--- a/workflow/scripts/convert_reads.py
+++ b/workflow/scripts/convert_reads.py
@@ -25,6 +25,7 @@ logger.setLevel(logging.INFO)

 def get_args():
     '''Define arguments.'''
+
     parser = argparse.ArgumentParser(
         description=__doc__, epilog=EPILOG,
         formatter_class=argparse.RawDescriptionHelpFormatter)
diff --git a/workflow/scripts/experiment_design.py b/workflow/scripts/experiment_design.py
new file mode 100644
index 0000000000000000000000000000000000000000..f527b46d32f3dda20be3c84a4a8db822e9cb7310
--- /dev/null
+++ b/workflow/scripts/experiment_design.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+
+'''Generate experiment design files for downstream processing.'''
+
+import argparse
+import logging
+import pandas as pd
+
+EPILOG = '''
+For more details:
+    %(prog)s --help
+'''
+
+# SETTINGS
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+
+
+def get_args():
+    '''Define arguments.'''
+
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument('-d', '--design',
+                        help="The design file to make experiments (tsv format).",
+                        required=True)
+
+    args = parser.parse_args()
+    return args
+
+
+def update_controls(design):
+    '''Update design file to append controls list.'''
+
+    logger.info("Running control file update.")
+
+    file_dict = design[['sample_id', 'tag_align']] \
+                .set_index('sample_id').T.to_dict()
+
+    design['control_tag_align'] = design['control_id'] \
+                                .apply(lambda x: file_dict[x]['tag_align'])
+
+    logger.info("Removing rows that are their own control.")
+
+    design = design[design['control_id'] != design['sample_id']]
+
+    return design
+
+
+def make_experiment_design(design):
+    '''Make design files by grouping on experiment_id.'''
+
+    logger.info("Running experiment design generation.")
+
+    for experiment, df_experiment in design.groupby('experiment_id'):
+        experiment_file = experiment + '.tsv'
+        df_experiment.to_csv(experiment_file, header=True, sep='\t', index=False)
+
+
+def main():
+    args = get_args()
+    design = args.design
+
+    # Create a file handler
+    handler = logging.FileHandler('experiment_generation.log')
+    logger.addHandler(handler)
+
+    # Read files as dataframes
+    design_df = pd.read_csv(design, sep='\t')
+
+    # Update design file to add control tag align files
+    new_design_df = update_controls(design_df)
+
+    # Write out experiment design files
+    make_experiment_design(new_design_df)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/workflow/scripts/experiment_qc.py b/workflow/scripts/experiment_qc.py
index 47a398344fa789fb2a5c5abf159db5375b500efe..6f63b52993b54de042f57968d1cdcc1c21bc595c 100644
--- a/workflow/scripts/experiment_qc.py
+++ b/workflow/scripts/experiment_qc.py
@@ -25,6 +25,7 @@ logger.setLevel(logging.INFO)

 def get_args():
     '''Define arguments.'''
+
     parser = argparse.ArgumentParser(
         description=__doc__, epilog=EPILOG,
         formatter_class=argparse.RawDescriptionHelpFormatter)
@@ -145,6 +146,7 @@ def check_enrichment(sample_id, control_id, sample_reads, control_reads):

 def main():
     args = get_args()
+    design = args.design

     # Create a file handler
     handler = logging.FileHandler('experiment_qc.log')
@@ -154,18 +156,18 @@ def main():
     check_tools()

     # Read files
-    design_file = pd.read_csv(args.design, sep='\t')
+    design_df = pd.read_csv(design, sep='\t')

     # Run correlation
-    mbs_filename = generate_read_summary(design_file)
+    mbs_filename = generate_read_summary(design_df)
     check_correlation(mbs_filename)

     # Run coverage
-    check_coverage(design_file)
+    check_coverage(design_df)

     # Run enrichment
-    new_design = update_controls(design_file)
-    for index, row in new_design.iterrows():
+    new_design_df = update_controls(design_df)
+    for index, row in new_design_df.iterrows():
         check_enrichment(
             row['sample_id'],
             row['control_id'],
diff --git a/workflow/scripts/map_qc.py b/workflow/scripts/map_qc.py
index 4451d916126fce6d4683539005a9cdda89c3f50a..3f07068352724bfd86f7513342e373fc4fabc779 100644
--- a/workflow/scripts/map_qc.py
+++ b/workflow/scripts/map_qc.py
@@ -34,6 +34,7 @@ STRIP_EXTENSIONS = ['.bam', '.srt']

 def get_args():
     '''Define arguments.'''
+
     parser = argparse.ArgumentParser(
         description=__doc__, epilog=EPILOG,
         formatter_class=argparse.RawDescriptionHelpFormatter)
diff --git a/workflow/scripts/map_reads.py b/workflow/scripts/map_reads.py
index ba6bbd0722a49ab09ddd84befbfe0a0ce3e07f19..b7d3144fe8484f2d8939c773229c45b58e283315 100644
--- a/workflow/scripts/map_reads.py
+++ b/workflow/scripts/map_reads.py
@@ -33,6 +33,7 @@ STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '_trimmed']

 def get_args():
     '''Define arguments.'''
+
     parser = argparse.ArgumentParser(
         description=__doc__, epilog=EPILOG,
         formatter_class=argparse.RawDescriptionHelpFormatter)
diff --git a/workflow/scripts/pool_and_psuedoreplicate.py b/workflow/scripts/pool_and_psuedoreplicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..2dae5f78b8ffa3e5ee6e8b2e6b04e09ff6a6d019
--- /dev/null
+++ b/workflow/scripts/pool_and_psuedoreplicate.py
@@ -0,0 +1,278 @@
+#!/usr/bin/env python3
+
+'''Generate pooled and pseudoreplicate from data.'''
+
+import argparse
+import utils
+import logging
+import pandas as pd
+import numpy as np
+import os
+
+EPILOG = '''
+For more details:
+    %(prog)s --help
+'''
+
+# SETTINGS
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+
+
+def get_args():
+    '''Define arguments.'''
+
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument('-d', '--design',
+                        help="The design file to make experiments (tsv format).",
+                        required=True)
+
+    parser.add_argument('-p', '--paired',
+                        help="True/False if paired-end or single end.",
+                        default=False,
+                        action='store_true')
+
+    parser.add_argument('-c', '--cutoff',
+                        help="Cutoff ratio used for choosing controls.",
+                        type=float,
+                        default=1.2)
+
+    args = parser.parse_args()
+    return args
+
+
+def check_replicates(design):
+    '''Check the number of replicates for the experiment.'''
+
+    no_rep = design.shape[0]
+
+    return no_rep
+
+
+def check_controls(design):
+    '''Check the number of controls for the experiment.'''
+
+    no_controls = len(design.control_tag_align.unique())
+
+    return no_controls
+
+
+def get_read_count_ratio(design):
+    '''Get the ratio of read counts for unique controls.'''
+
+    controls = design.control_tag_align.unique()
+    control_dict = {}
+    for con in controls:
+        no_reads = utils.count_lines(con)
+        control_dict[con] = no_reads
+
+    # Pairwise matrix of control read counts, each column divided by
+    # each control's read count
+    control_matrix = {c: control_dict for c in controls}
+    control_df = pd.DataFrame.from_dict(control_matrix)
+
+    control_ratio = control_df.divide(list(control_dict.values()), axis=0)
+    return control_ratio
+
+
+def pool(tag_files, outfile, paired):
+    '''Pool files together.'''
+
+    if paired:
+        file_extension = '.bedpe.gz'
+    else:
+        file_extension = '.bedse.gz'
+
+    pooled_filename = outfile + file_extension
+
+    # Merge files
+    out, err = utils.run_pipe([
+        'gzip -dc %s' % (' '.join(tag_files)),
+        'gzip -cn'],
+        outfile=pooled_filename)
+
+    return pooled_filename
+
+
+def self_psuedoreplication(tag_file, prefix, paired):
+    '''Make 2 self-pseudoreplicates.'''
+
+    # Get total number of reads
+    no_lines = utils.count_lines(tag_file)
+
+    # Number of lines to split into
+    lines_per_rep = (no_lines + 1) // 2
+
+    # Map pseudoreplicate number to output file name
+    pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.bedse.tagAlign.gz'
+                            for r in [0, 1]}
+
+    # Shuffle and split file into equal parts
+    # by using the input to seed shuf we ensure multiple runs with the same
+    # input will produce the same output
+    # Produces two files named splits_prefix0n, n=0,1
+
+    splits_prefix = 'temp_split'
+
+    out, err = utils.run_pipe([
+        'gzip -dc %s' % (tag_file),
+        'shuf --random-source=%s' % (tag_file),
+        'split -d -l %d - %s' % (lines_per_rep, splits_prefix)])
+
+    # Convert read pairs to reads into standard tagAlign file
+
+    for index in [0, 1]:
+        string_index = '0' + str(index)
+        steps = ['cat %s' % (splits_prefix + string_index)]
+        if paired:
+            steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
+        steps.extend(['gzip -cn'])
+        out, err = utils.run_pipe(steps, outfile=pseudoreplicate_dict[index])
+        os.remove(splits_prefix + string_index)
+
+    return pseudoreplicate_dict
+
+
+def main():
+    args = get_args()
+    paired = args.paired
+    design = args.design
+    cutoff_ratio = args.cutoff
+
+    # Create a file handler
+    handler = logging.FileHandler('experiment_generation.log')
+    logger.addHandler(handler)
+
+    # Read files as dataframes
+    design_df = pd.read_csv(design, sep='\t')
+
+    # Get current directory to build paths
+    cwd = os.getcwd()
+
+    # Check number of replicates and controls
+    no_reps = check_replicates(design_df)
+    no_unique_controls = check_controls(design_df)
+
+    if no_reps == 1:
+        logger.info("No other replicate specified "
+                    "so processing as an unreplicated experiment.")
+        replicated = False
+
+    else:
+        logger.info("Multiple replicates specified "
+                    "so processing as a replicated experiment.")
+        replicated = True
+
+    if no_unique_controls == 1 and replicated:
+        logger.info("Only a single control was specified "
+                    "so using same control for replicates, pool and pseudoreplicates.")
+        single_control = True
+    else:
+        logger.info("Will merge only unique controls for pooled.")
+        single_control = False
+
+    # Pool the controls for checking
+    if not single_control:
+        control_df = get_read_count_ratio(design_df)
+        control_files = design_df.control_tag_align.unique()
+        pool_control = pool(control_files, "pool_control", paired)
+    else:
+        pool_control = design_df.control_tag_align.unique()[0]
+
+    # Pseudoreplicate and update design accordingly
+    if not replicated:
+
+        # Duplicate rows and update for pool and pseudoreplicates
+        experiment_id = design_df.at[0, 'experiment_id']
+        replicate = design_df.at[0, 'replicate']
+        design_new_df = design_df.loc[np.repeat(design_df.index, 4)].reset_index(drop=True)
+        design_new_df.set_value(1, 'sample_id', experiment_id + '_pr1')
+        design_new_df.set_value(1, 'replicate', '1_pr')
+        design_new_df.set_value(2, 'sample_id', experiment_id + '_pr2')
+        design_new_df.set_value(2, 'replicate', '2_pr')
+        design_new_df.set_value(3, 'sample_id', experiment_id + '_pooled')
+        design_new_df.set_value(3, 'replicate', 'pooled')
+
+        # Make 2 self pseudoreplicates
+        self_pseudoreplicates_dict = {}
+        for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']):
+            replicate_prefix = experiment_id + '_' + str(rep)
+            self_pseudoreplicates_dict = \
+                self_psuedoreplication(tag_file, replicate_prefix, paired)
+
+        # Update design to include new self pseudo replicates
+        for rep, pseudorep_file in self_pseudoreplicates_dict.items():
+            path_to_file = cwd + '/' + pseudorep_file
+            replicate = rep + 1
+            design_new_df.loc[replicate, 'tag_align'] = path_to_file
+
+    else:
+        # Make pool of replicates
+        replicate_files = design_df.tag_align.unique()
+        experiment_id = design_df.at[0, 'experiment_id']
+        pool_experiment = pool(replicate_files, experiment_id + "_pooled", paired)
+
+        # Make self pseudoreplicates equivalent to number of replicates
+        pseudoreplicates_dict = {}
+        for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']):
+            replicate_prefix = experiment_id + '_' + str(rep)
+            pr_dict = self_psuedoreplication(tag_file, replicate_prefix, paired)
+            pseudoreplicates_dict[rep] = pr_dict
+
+        # Merge self pseudoreplication for each true replicate
+        pseudoreplicates_df = pd.DataFrame.from_dict(pseudoreplicates_dict)
+        pool_pseudoreplicates_dict = {}
+        for index, row in pseudoreplicates_df.iterrows():
+            replicate_id = index + 1
+            # pool() appends the .bedse.gz extension itself
+            pr_filename = experiment_id + ".pr" + str(replicate_id)
+            pool_replicate = pool(row, pr_filename, False)
+            pool_pseudoreplicates_dict[replicate_id] = pool_replicate
+
+        design_new_df = design_df
+        # Check controls against cutoff_ratio
+        # if so replace with pool_control
+        path_to_pool_control = cwd + '/' + pool_control
+        if not single_control and control_df.values.max() > cutoff_ratio:
+            logger.info("Number of reads in controls differ by "
+                        "> factor of %f. Using pooled controls." % (cutoff_ratio))
+            design_new_df['control_tag_align'] = path_to_pool_control
+        else:
+            for index, row in design_new_df.iterrows():
+                exp_no_reads = utils.count_lines(row['tag_align'])
+                con_no_reads = utils.count_lines(row['control_tag_align'])
+                if con_no_reads < exp_no_reads:
+                    logger.info("Fewer reads in control than experiment. "
+                                "Using pooled controls for replicate %s."
+                                % row['replicate'])
+                    design_new_df.loc[index, 'control_tag_align'] = \
+                        path_to_pool_control
+
+        # Add in pseudo replicates
+        tmp_metadata = design_new_df.loc[0].copy()
+        tmp_metadata['control_tag_align'] = path_to_pool_control
+        for rep, pseudorep_file in pool_pseudoreplicates_dict.items():
+            tmp_metadata['sample_id'] = experiment_id + '_pr' + str(rep)
+            tmp_metadata['replicate'] = str(rep) + '_pr'
+            tmp_metadata['xcor'] = 'Calculate'
+            path_to_file = cwd + '/' + pseudorep_file
+            tmp_metadata['tag_align'] = path_to_file
+            design_new_df = design_new_df.append(tmp_metadata)
+
+        # Add in pool experiment
+        tmp_metadata['sample_id'] = experiment_id + '_pooled'
+        tmp_metadata['replicate'] = 'pooled'
+        tmp_metadata['xcor'] = 'Calculate'
+        path_to_file = cwd + '/' + pool_experiment
+        tmp_metadata['tag_align'] = path_to_file
+        design_new_df = design_new_df.append(tmp_metadata)
+
+    # Write out new dataframe
+    design_new_df.to_csv(experiment_id + '_ppr.tsv',
+                         header=True, sep='\t', index=False)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/workflow/scripts/trim_reads.py b/workflow/scripts/trim_reads.py
index f9f2524f942fb356c360696cee70ea80a403cc4b..036c5f700eb61011cce4008230b60c634ea1c582 100644
--- a/workflow/scripts/trim_reads.py
+++ b/workflow/scripts/trim_reads.py
@@ -22,6 +22,7 @@ logger.setLevel(logging.INFO)

 def get_args():
     '''Define arguments.'''
+
     parser = argparse.ArgumentParser(
         description=__doc__, epilog=EPILOG,
         formatter_class=argparse.RawDescriptionHelpFormatter)
@@ -80,6 +81,8 @@ def trim_reads(fastq, paired):

 def main():
     args = get_args()
+    fastq = args.fastq
+    paired = args.paired

     # Create a file handler
     handler = logging.FileHandler('trim.log')
@@ -89,7 +92,7 @@ def main():
     check_tools()

     # Run trim_reads
-    trim_reads(args.fastq, args.paired)
+    trim_reads(fastq, paired)


 if __name__ == '__main__':
diff --git a/workflow/scripts/utils.py b/workflow/scripts/utils.py
index 5162646d56b5281a88cd7195dba784865cffe7f8..2643c6e409938d30453d1bc93359df81651de5fe 100644
--- a/workflow/scripts/utils.py
+++ b/workflow/scripts/utils.py
@@ -53,3 +53,22 @@ def strip_extensions(filename, extensions):
         basename = basename.rpartition(extension)[0] or basename

     return basename
+
+
+def count_lines(filename):
+    '''Count the lines in a file, decompressing it first if compressed.'''
+    import mimetypes
+    compressed_mimetypes = [
+        "compress",
+        "bzip2",
+        "gzip"
+    ]
+    mime_type = mimetypes.guess_type(filename)[1]
+    if mime_type in compressed_mimetypes:
+        catcommand = 'gzip -dc'
+    else:
+        catcommand = 'cat'
+    out, err = run_pipe([
+        '%s %s' % (catcommand, filename),
+        'wc -l'
+    ])
+    return int(out)
diff --git a/workflow/scripts/xcor.py b/workflow/scripts/xcor.py
index 88750b6d5e5e128ff7b9a2faf843f0eb165bfd5b..0bf51e6400d05d2ea9cdb978883fed3c765d3100 100644
--- a/workflow/scripts/xcor.py
+++ b/workflow/scripts/xcor.py
@@ -24,6 +24,7 @@ logger.setLevel(logging.INFO)

 def get_args():
     '''Define arguments.'''
+
     parser = argparse.ArgumentParser(
         description=__doc__, epilog=EPILOG,
         formatter_class=argparse.RawDescriptionHelpFormatter)
diff --git a/workflow/tests/test_experiment_design.py b/workflow/tests/test_experiment_design.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfbd3c01148fb2e31e40f0ddcf25a4f208b92caa
--- /dev/null
+++ b/workflow/tests/test_experiment_design.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import experiment_design
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+                '/../output/design/'
+
+DESIGN_STRING = """sample_id\ttag_align\txcor\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id
+A_1\tA_1.tagAlign.gz\tA\tLiver\tH3K27ac\tNone\t1\tB_1
+A_2\tA_2.tagAlign.gz\tA\tLiver\tH3K27ac\tNone\t2\tB_2
+B_1\tB_1.tagAlign.gz\tB\tLiver\tInput\tNone\t1\tB_1
+B_2\tB_2.tagAlign.gz\tB\tLiver\tInput\tNone\t2\tB_2
+"""
+
+
+@pytest.fixture
+def design_tag():
+    design_file = StringIO(DESIGN_STRING)
+    design_df = pd.read_csv(design_file, sep="\t")
+    return design_df
+
+
+def test_check_update_controls_tag(design_tag):
+    new_design = experiment_design.update_controls(design_tag)
+    assert new_design.loc[0, 'control_tag_align'] == "B_1.tagAlign.gz"
+
+
+def test_experiment_design_single_end():
+    design_file = os.path.join(test_output_path, 'ENCSR238SGC.tsv')
+    assert os.path.exists(design_file)
+    design_df = pd.read_csv(design_file, sep="\t")
+    assert design_df.shape[0] == 2
+
+
+def test_experiment_design_paired_end():
+    # Do the same thing for paired end data
+    pass
diff --git a/workflow/tests/test_pool_and_psuedoreplicate.py b/workflow/tests/test_pool_and_psuedoreplicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7a351d94adbbf351d2b6da03575395a09514801
--- /dev/null
+++ b/workflow/tests/test_pool_and_psuedoreplicate.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+import pool_and_psuedoreplicate
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+                '/../output/design/'
+
+DESIGN_STRING = """sample_id\ttag_align\txcor\tbiosample\tfactor\ttreatment\treplicate\tcontrol_tag_align
+A_1\tA_1.bedse.gz\tA_1.cc.qc\tLiver\tH3K27ac\tNone\t1\tB_1.bedse.gz
+A_2\tA_2.bedse.gz\tA_2.cc.qc\tLiver\tH3K27ac\tNone\t2\tB_2.bedse.gz
+"""
+
+
+@pytest.fixture
+def design_experiment():
+    design_file = StringIO(DESIGN_STRING)
+    design_df = pd.read_csv(design_file, sep="\t")
+    return design_df
+
+
+@pytest.fixture
+def design_experiment_2(design_experiment):
+    # Drop Replicate A_2
+    design_df = design_experiment.drop(design_experiment.index[1])
+    return design_df
+
+
+@pytest.fixture
+def design_experiment_3(design_experiment):
+    # Update second control to be same as first
+    design_experiment.loc[1, 'control_tag_align'] = 'B_1.bedse.gz'
+    return design_experiment
+
+
+def test_check_replicates(design_experiment):
+    no_reps = pool_and_psuedoreplicate.check_replicates(design_experiment)
+    assert no_reps == 2
+
+
+def test_check_replicates_single(design_experiment_2):
+    no_reps = pool_and_psuedoreplicate.check_replicates(design_experiment_2)
+    assert no_reps == 1
+
+
+def test_check_controls(design_experiment):
+    no_controls = pool_and_psuedoreplicate.check_controls(design_experiment)
+    assert no_controls == 2
+
+
+def test_check_controls_single(design_experiment_3):
+    no_controls = pool_and_psuedoreplicate.check_controls(design_experiment_3)
+    assert no_controls == 1
+
+
+def test_pool_and_psuedoreplicate_single_end():
+    design_file = os.path.join(test_output_path, 'ENCSR238SGC_ppr.tsv')
+    assert os.path.exists(design_file)
+    design_df = pd.read_csv(design_file, sep="\t")
+    assert design_df.shape[0] == 5
+
+
+def test_pool_and_psuedoreplicate_paired_end():
+    # Do the same thing for paired end data
+    pass
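
The core of this patch is the deterministic shuffle-and-split in self_psuedoreplication, which pipes through gzip, shuf, and split. For reference, a minimal pure-Python sketch of the same idea (single-end, gzipped input); this is an illustration, not the patch's shell-based implementation, and self_pseudoreplicate is a hypothetical stand-in name:

    #!/usr/bin/env python3
    # Sketch only: shuffle reads deterministically, then split into two halves.

    import gzip
    import random


    def self_pseudoreplicate(tag_file, prefix):
        '''Split a gzipped tagAlign file into two deterministic pseudoreplicates.'''
        with gzip.open(tag_file, 'rt') as handle:
            lines = handle.readlines()

        # Seeding with the input file name mirrors `shuf --random-source=<file>`
        # in spirit: the same input always yields the same pseudoreplicates.
        random.Random(tag_file).shuffle(lines)

        half = (len(lines) + 1) // 2
        outputs = {}
        for index, chunk in enumerate([lines[:half], lines[half:]]):
            out_name = '%s.pr%d.bedse.tagAlign.gz' % (prefix, index)
            with gzip.open(out_name, 'wt') as out:
                out.writelines(chunk)
            outputs[index] = out_name
        return outputs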