diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5c3cefdeca3907c797ba9776141126be2085a191..79370bd0eda7a5926ba2c01514fbceb9941f944d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -19,8 +19,12 @@ single_end_mouse: - nextflow run workflow/main.nf - pytest -m integration - pytest --cov=./workflow/scripts + artifacts: + expire_in: 2 days paired_end_human: stage: integration script: - - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_ENCSR729LGA_PE.txt" --genome 'GRCh38' + - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_ENCSR729LGA_PE.txt" --genome 'GRCh38' --pairedEnd true + artifacts: + expire_in: 2 days diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 74388811bd5117ed7d8bd6d963882990fee5cd29..647927efa383ced7c7bcf0eed33482f6df562091 100644 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -1,6 +1,6 @@ process { executor = 'slurm' - queue='super' + queue = 'super' // Process specific configuration $checkDesignFile { diff --git a/workflow/main.nf b/workflow/main.nf index aaa3a26622b2f87e1960697e764dd24cc5f748ad..6da4f6dc8b3736d9cf5efd08a1987e9f9794e121 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -127,7 +127,7 @@ process alignReads { if (pairedEnd) { """ - python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa -p + python3 $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -p """ } else { @@ -240,7 +240,7 @@ process crossReads { output: - set sampleId, tagAlign, file('*.cc.qc'), experimentId, biosample, factor, treatment, replicate, controlId into xcorReads + set sampleId, seTagAlign, tagAlign, file('*.cc.qc'), experimentId, biosample, factor, treatment, replicate, controlId into xcorReads set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats script: @@ -260,9 +260,9 @@ process crossReads { // Define channel collecting tagAlign and xcor into design file xcorDesign = xcorReads - .map{ sampleId, 
tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId -> - "$sampleId\t$tagAlign\t$xcor\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"} - .collectFile(name:'design_xcor.tsv', seed:"sample_id\ttag_align\txcor\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design") + .map{ sampleId, seTagAlign, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId -> + "$sampleId\t$seTagAlign\t$tagAlign\t$xcor\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"} + .collectFile(name:'design_xcor.tsv', seed:"sample_id\tse_tag_align\ttag_align\txcor\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design") // Make Experiment design files to be read in for downstream analysis process defineExpDesignFiles { diff --git a/workflow/scripts/pool_and_psuedoreplicate.py b/workflow/scripts/pool_and_psuedoreplicate.py index 2890ff2f8c81a144fe2687f9f963acd250317357..3481495e69af6d0a734052a4b4f391a917174f95 100644 --- a/workflow/scripts/pool_and_psuedoreplicate.py +++ b/workflow/scripts/pool_and_psuedoreplicate.py @@ -40,6 +40,7 @@ def get_args(): parser.add_argument('-c', '--cutoff', help="Cutoff ratio used for choosing controls.", + type=float, default=1.2) args = parser.parse_args() @@ -97,6 +98,20 @@ def pool(tag_files, outfile, paired): return pooled_filename +def bedpe_to_tagalign(tag_file, outfile): + '''Convert read pairs to reads into standard tagAlign file.''' + + se_tag_filename = outfile + "bedse.tagAlign.gz" + + # Convert read pairs to reads into standard tagAlign file + tag_steps = ["zcat -f %s" % (tag_file)] + tag_steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""]) + tag_steps.extend(['gzip -cn']) + out, err = utils.run_pipe(tag_steps, outfile=se_tag_filename) + + return se_tag_filename + + def 
self_psuedoreplication(tag_file, prefix, paired): '''Make 2 self-psuedoreplicates.''' @@ -182,13 +197,24 @@ def main(): else: pool_control = design_df.control_tag_align.unique()[0] + # if paired_end make tagAlign + if paired: + pool_control_tmp = bedpe_to_tagalign(pool_control, "pool_control") + pool_control = pool_control_tmp + # Psuedoreplicate and update design accordingly if not replicated: - # Duplicate rows and update for pool and psuedoreplicates + # Duplicate rows and update for pool and psuedoreplicates and update tagAlign with single end data experiment_id = design_df.at[0, 'experiment_id'] replicate = design_df.at[0, 'replicate'] design_new_df = design_df.loc[np.repeat(design_df.index, 4)].reset_index() + + # Update tagAlign with single end data + if paired: + design_new_df['tag_align'] = design_new_df['se_tag_align'] + design_new_df.drop(labels = 'se_tag_align', axis = 1, inplace = True) + design_new_df['replicate'] = design_new_df['replicate'].astype(str) design_new_df.at[1, 'sample_id'] = experiment_id + '_pr1' design_new_df.at[1, 'replicate'] = '1_pr' @@ -199,6 +225,7 @@ def main(): design_new_df.at[3, 'sample_id'] = experiment_id + '_pooled' design_new_df.at[3, 'replicate'] = 'pooled' design_new_df.at[3, 'xcor'] = 'Calculate' + design_new_df.at[3, 'tag_align'] = design_new_df.at[0, 'tag_align'] # Make 2 self psuedoreplicates self_pseudoreplicates_dict = {} @@ -216,12 +243,20 @@ def main(): # Drop index column design_new_df.drop(labels='index', axis=1, inplace=True) + + else: # Make pool of replicates replicate_files = design_df.tag_align.unique() experiment_id = design_df.at[0, 'experiment_id'] pool_experiment = pool(replicate_files, experiment_id + "_pooled", paired) + # If paired change to single End + if paired: + pool_experiment_se = bedpe_to_tagalign(pool_experiment, experiment_id + "_pooled") + else: + pool_experiment_se = pool_experiment + # Make self psuedoreplicates equivalent to number of replicates pseudoreplicates_dict = {} for rep, 
tag_file in zip(design_df['replicate'], design_df['tag_align']): @@ -239,6 +274,10 @@ def main(): pool_pseudoreplicates_dict[replicate_id] = pool_replicate design_new_df = design_df + # Update tagAlign with single end data + if paired: + design_new_df['tag_align'] = design_new_df['se_tag_align'] + design_new_df.drop(labels = 'se_tag_align', axis = 1, inplace = True) # Check controls against cutoff_ratio # if so replace with pool_control # unless single control was used @@ -259,6 +298,16 @@ def main(): % row['replicate']) design_new_df.loc[index, 'control_tag_align'] = \ path_to_pool_control + else: + if paired: + control = row['control_tag_align'] + control_basename = os.path.basename( + utils.strip_extensions(control, ['.filt.nodup.bedpe.gz'])) + control_tmp = bedpe_to_tagalign(control, control_basename) + path_to_control = cwd + '/' + control_tmp + design_new_df.loc[index, 'control_tag_align'] = \ + path_to_control + else: path_to_pool_control = pool_control @@ -277,10 +326,11 @@ def main(): tmp_metadata['sample_id'] = experiment_id + '_pooled' tmp_metadata['replicate'] = 'pooled' tmp_metadata['xcor'] = 'Calculate' - path_to_file = cwd + '/' + pool_experiment + path_to_file = cwd + '/' + pool_experiment_se tmp_metadata['tag_align'] = path_to_file design_new_df = design_new_df.append(tmp_metadata) + # Write out new dataframe design_new_df.to_csv(experiment_id + '_ppr.tsv', header=True, sep='\t', index=False)