diff --git a/.gitignore b/.gitignore
index 113294a5f18d979085b4ad570d8a22a8e4eabd58..eb94944ee1efc2ed6d8571d51ad1bb7245c354ed 100644
--- a/.gitignore
+++ b/.gitignore
@@ -97,3 +97,6 @@ ENV/
 
 # mypy
 .mypy_cache/
+
+# Mac OS
+.DS_Store
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a54c5d7d11d173b292452f3906efd258c4b0582b..6d2c02bafa267b607097ebe73283d3b4e6a24c89 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,8 +1,8 @@
 before_script:
   - module add python/3.6.1-2-anaconda
-  - pip install --user pytest-pythonpath pytest-cov
+  - pip install --user pytest-pythonpath==0.7.1 pytest-cov==2.5.1
   - module load nextflow/0.31.0
-  - ln -s /work/BICF/s163035/chipseq/*fastq.gz test_data/
+  - ln -s /project/shared/bicf_workflow_ref/workflow_testdata/chipseq/*fastq.gz test_data/
 
 stages:
   - unit
@@ -17,6 +17,10 @@ user_configuration:
 
 single_end_mouse:
   stage: single
+  only:
+    - master
+  except:
+    - branches
   script:
   - nextflow run workflow/main.nf -resume
   - pytest -m singleend
@@ -25,6 +29,10 @@ single_end_mouse:
 
 paired_end_human:
   stage: single
+  only:
+    - branches
+  except:
+    - master
   script:
   - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_ENCSR729LGA_PE.txt" --genome 'GRCh38' --pairedEnd true -resume
   - pytest -m pairedend
@@ -33,6 +41,10 @@ paired_end_human:
 
 single_end_diff:
   stage: multiple
+  only:
+    - branches
+  except:
+    - master
   script:
   - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' -resume
   - pytest -m singlediff
@@ -40,6 +52,10 @@ single_end_diff:
   expire_in: 2 days
 
 paired_end_diff:
+  only:
+    - master
+  except:
+    - branches
   stage: multiple
   script:
   - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_PE.txt" --genome 'GRCh38' --pairedEnd true -resume
diff --git a/workflow/.DS_Store b/workflow/.DS_Store
deleted file mode 100644
index a563988c14d65d2b69eb3b32e875d0f346a3055a..0000000000000000000000000000000000000000
Binary files a/workflow/.DS_Store and /dev/null differ
diff --git a/workflow/main.nf b/workflow/main.nf
index 6c69edad2ea2dd0dc0af40c704c0e15ff974c932..a8c50272131ced87f134cf63241032ceddda2f98 100644
--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -105,12 +105,12 @@ process trimReads {
 
   if (pairedEnd) {
     """
-    python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -p
+    python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -s $sampleId -p
     """
   }
   else {
     """
-    python3 $baseDir/scripts/trim_reads.py -f ${reads[0]}
+    python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} -s $sampleId
     """
   }
 
@@ -130,18 +130,18 @@ process alignReads {
 
   output:
 
   set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into mappedReads
-  file '*.srt.bam.flagstat.qc' into mappedReadsStats
+  file '*.flagstat.qc' into mappedReadsStats
 
   script:
 
   if (pairedEnd) {
     """
-    python3 $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -p
+    python3 $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -s $sampleId -p
     """
   }
   else {
     """
-    python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa
+    python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa -s $sampleId
     """
   }
 
@@ -161,9 +161,9 @@ process filterReads {
 
   set sampleId, file('*.bam'), file('*.bai'), experimentId, biosample, factor, treatment, replicate, controlId into dedupReads
   set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into convertReads
-  file '*flagstat.qc' into dedupReadsStats
-  file '*pbc.qc' into dedupReadsComplexity
-  file '*dup.qc' into dupReads
+  file '*.flagstat.qc' into dedupReadsStats
+  file '*.pbc.qc' into dedupReadsComplexity
+  file '*.dedup.qc' into dupReads
 
   script:
 
@@ -342,6 +342,7 @@ process callPeaksMACS {
   output:
 
   set sampleId, file('*.narrowPeak'), file('*.fc_signal.bw'), file('*.pvalue_signal.bw'), experimentId, biosample, factor, treatment, replicate, controlId into experimentPeaks
+  file '*.xls' into summit
 
   script:
 
@@ -368,6 +369,7 @@ peaksDesign = experimentPeaks
 process consensusPeaks {
 
   publishDir "$outDir/${task.process}", mode: 'copy'
+  publishDir "$outDir/design", mode: 'copy', pattern: '*.{csv,tsv}'
 
   input:
 
@@ -393,7 +395,7 @@
 // Annotate Peaks
 process peakAnnotation {
 
-  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  publishDir "$outDir/${task.process}", mode: 'copy'
 
   input:
 
@@ -414,7 +416,7 @@
 // Motif Search Peaks
 process motifSearch {
 
-  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  publishDir "$outDir/${task.process}", mode: 'copy'
 
   input:
 
@@ -423,7 +425,7 @@ process motifSearch {
   output:
 
   file "*memechip" into motifSearch
-  file "sorted-*" into filteredPeaks
+  file "*narrowPeak" into filteredPeaks
 
   script:
 
@@ -439,7 +441,7 @@ uniqueExperimentsList = uniqueExperiments
 // Calculate Differential Binding Activity
 process diffPeaks {
 
-  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  publishDir "$outDir/${task.process}", mode: 'copy'
 
   input:
 
@@ -456,7 +458,6 @@ process diffPeaks {
   when:
   noUniqueExperiments > 1
-
   script:
   """
   Rscript $baseDir/scripts/diff_peaks.R $designDiffPeaks
diff --git a/workflow/scripts/annotate_peaks.R b/workflow/scripts/annotate_peaks.R
index c15ba698dc3f9d120024426b4d7f57bd24a741a6..4bc8d79296150f64c1a1eed2ecba0b5e33b597d1 100644
--- a/workflow/scripts/annotate_peaks.R
+++ b/workflow/scripts/annotate_peaks.R
@@ -17,7 +17,7 @@ args <- commandArgs(trailingOnly=TRUE)
 
 # Check input args
 if (length(args) != 2) {
-  stop("Usage: annotate_peaks.R [ annotate_design.tsv ] [ genome_assembly ]", call.=FALSE)
+  stop("Usage: annotate_peaks.R annotate_design.tsv genome_assembly", call.=FALSE)
 }
 
 design_file <- args[1]
diff --git a/workflow/scripts/call_peaks_macs.py b/workflow/scripts/call_peaks_macs.py
index 23174e07936ec09ccf494d1c713bd71bb4d72713..17b1414b6236ee5cfda0d1c0694a988ecea237c9 100644
--- a/workflow/scripts/call_peaks_macs.py
+++ b/workflow/scripts/call_peaks_macs.py
@@ -6,7 +6,6 @@ import os
 import argparse
 import shutil
 import logging
-from multiprocessing import cpu_count
 
 import utils
 from xcor import xcor as calculate_xcor
@@ -100,6 +99,7 @@ def check_tools():
 
 def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes):
+    '''Call peaks and signal tracks'''
 
     # Extract the fragment length estimate from column 3 of the
     # cross-correlation scores file
@@ -107,7 +107,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
         firstline = xcor_fh.readline()
         frag_lengths = firstline.split()[2]  # third column
         fragment_length = frag_lengths.split(',')[0]  # grab first value
-    logger.info("Fraglen %s" % (fragment_length))
+    logger.info("Fraglen %s", fragment_length)
 
     # Generate narrow peaks and preliminary signal tracks
 
@@ -119,18 +119,19 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
     logger.info(command)
     returncode = utils.block_on(command)
-    logger.info("MACS2 exited with returncode %d" % (returncode))
+    logger.info("MACS2 exited with returncode %d", returncode)
     assert returncode == 0, "MACS2 non-zero return"
 
     # MACS2 sometimes calls features off the end of chromosomes.
     # Remove coordinates outside chromosome sizes
 
-    narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix)
+    int_narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix)
+    narrowpeak_fn = '%s.narrowPeak' % (prefix)
     clipped_narrowpeak_fn = 'clipped-%s' % (narrowpeak_fn)
 
-    steps = ['slopBed -i %s -g %s -b 0' % (narrowpeak_fn, chrom_sizes),
-            'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)]
+    steps = ['slopBed -i %s -g %s -b 0' % (int_narrowpeak_fn, chrom_sizes),
+             'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)]
 
     out, err = utils.run_pipe(steps)
 
@@ -161,7 +162,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
     logger.info(command)
     returncode = utils.block_on(command)
-    logger.info("MACS2 exited with returncode %d" % (returncode))
+    logger.info("MACS2 exited with returncode %d", returncode)
     assert returncode == 0, "MACS2 non-zero return"
 
     # Remove coordinates outside chromosome sizes (MACS2 bug)
@@ -169,7 +170,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
     fc_bedgraph_sorted_fn = 'sorted-%s' % (fc_bedgraph_fn)
     fc_signal_fn = "%s.fc_signal.bw" % (prefix)
     steps = ['slopBed -i %s_FE.bdg -g %s -b 0' % (prefix, chrom_sizes),
-            'bedClip stdin %s %s' % (chrom_sizes, fc_bedgraph_fn)]
+             'bedClip stdin %s %s' % (chrom_sizes, fc_bedgraph_fn)]
 
     out, err = utils.run_pipe(steps)
 
@@ -185,7 +186,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
     logger.info(command)
     returncode = utils.block_on(command)
-    logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
+    logger.info("bedGraphToBigWig exited with returncode %d", returncode)
     assert returncode == 0, "bedGraphToBigWig non-zero return"
 
     # For -log10(p-value) signal tracks
@@ -199,7 +200,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
     sval = str(min(float(chip_reads), float(control_reads)) / 1000000)
 
     logger.info("chip_reads = %s, control_reads = %s, sval = %s" %
-               (chip_reads, control_reads, sval))
+                (chip_reads, control_reads, sval))
 
     command = 'macs2 bdgcmp ' + \
               '-t %s_treat_pileup.bdg ' % (prefix) + \
@@ -216,7 +217,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
     pvalue_bedgraph_sorted_fn = 'sort-%s' % (pvalue_bedgraph_fn)
     pvalue_signal_fn = "%s.pvalue_signal.bw" % (prefix)
     steps = ['slopBed -i %s_ppois.bdg -g %s -b 0' % (prefix, chrom_sizes),
-            'bedClip stdin %s %s' % (chrom_sizes, pvalue_bedgraph_fn)]
+             'bedClip stdin %s %s' % (chrom_sizes, pvalue_bedgraph_fn)]
 
     out, err = utils.run_pipe(steps)
 
@@ -232,12 +233,13 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
     logger.info(command)
     returncode = utils.block_on(command)
-    logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
+    logger.info("bedGraphToBigWig exited with returncode %d", returncode)
     assert returncode == 0, "bedGraphToBigWig non-zero return"
 
     # Remove temporary files
     os.remove(clipped_narrowpeak_fn)
     os.remove(rescaled_narrowpeak_fn)
+    os.remove(int_narrowpeak_fn)
 
 
 def main():
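A note on the signal-track scaling in call_peaks_macs.py above: the value passed to `macs2 bdgcmp` for the -log10(p-value) track is `sval`, the depth of the smaller of the two libraries in millions of reads. A standalone sketch of that arithmetic (illustration only; in the pipeline the read counts come from the tagAlign files, and the depths below are just plausible example figures):

```python
# Sketch of the sval computation used before `macs2 bdgcmp`:
# scale to the smaller of the two libraries, in millions of reads.
def ppois_scale_factor(chip_reads, control_reads):
    return min(float(chip_reads), float(control_reads)) / 1000000

# Example with made-up depths: 80.0M chip tags vs 47.4M control tags.
print(ppois_scale_factor(80050072, 47388510))  # 47.38851
```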
Exception("Missing column headers: %s" % list(missing_headers)) +def check_samples(design): + '''Check if design file has the correct sample name mapping.''' + + logger.info("Running sample check.") + + samples = design.groupby('sample_id') \ + .apply(list) + + malformated_samples = [] + chars = set('-.') + for sample in samples.index.values: + if(any(char.isspace() for char in sample) | any((char in chars) for char in sample)): + malformated_samples.append(sample) + + if len(malformated_samples) > 0: + logger.error('Malformed samples from design file: %s', list(malformated_samples)) + raise Exception("Malformed samples from design file: %s" % + list(malformated_samples)) + + +def check_experiments(design): + '''Check if design file has the correct experiment name mapping.''' + + logger.info("Running experiment check.") + + experiments = design.groupby('experiment_id') \ + .apply(list) + + malformated_experiments = [] + chars = set('-.') + for experiment in experiments.index.values: + if(any(char.isspace() for char in experiment) | any((char in chars) for char in experiment)): + malformated_experiments.append(experiment) + + if len(malformated_experiments) > 0: + logger.error('Malformed experiment from design file: %s', list(malformated_experiments)) + raise Exception("Malformed experiment from design file: %s" % + list(malformated_experiments)) + + def check_controls(design): '''Check if design file has the correct control mapping.''' @@ -146,8 +186,8 @@ def main(): logger.addHandler(handler) # Read files as dataframes - design_df = pd.read_csv(args.design, sep='\t') - fastq_df = pd.read_csv(args.fastq, sep='\t', names=['name', 'path']) + design_df = pd.read_csv(design, sep='\t') + fastq_df = pd.read_csv(fastq, sep='\t', names=['name', 'path']) # Check design file check_design_headers(design_df, paired) diff --git a/workflow/scripts/convert_reads.py b/workflow/scripts/convert_reads.py index 2c39680f97fcdc95eaee94a6d27cb5511e05ce95..bba7c6f16e49b7ea608d50546ec0b5af21ed4448 100644 --- a/workflow/scripts/convert_reads.py +++ b/workflow/scripts/convert_reads.py @@ -91,8 +91,7 @@ def convert_mapped_pe(bam, bam_basename): out, err = utils.run_pipe([ "bamToBed -bedpe -mate1 -i %s" % (nmsrt_bam_filename), - "gzip -nc"], - outfile=bedpe_filename) + "gzip -nc"], outfile=bedpe_filename) def main(): @@ -109,7 +108,7 @@ def main(): # Convert PE or SE to tagAlign bam_basename = os.path.basename( - utils.strip_extensions(bam, ['.bam'])) + utils.strip_extensions(bam, ['.bam', '.dedup'])) tag_filename = bam_basename + '.tagAlign.gz' convert_mapped(bam, tag_filename) @@ -117,7 +116,7 @@ def main(): if paired: # paired-end data convert_mapped_pe(bam, bam_basename) else: - bedse_filename = bam_basename + ".bedse.gz" + bedse_filename = bam_basename + ".bedse.gz" shutil.copy(tag_filename, bedse_filename) diff --git a/workflow/scripts/diff_peaks.R b/workflow/scripts/diff_peaks.R index 76caa95495c7cae1d014530e78e1e1e3c4af09dc..aae1d24751f24396fff6fe712c28be27e36b0db3 100644 --- a/workflow/scripts/diff_peaks.R +++ b/workflow/scripts/diff_peaks.R @@ -8,7 +8,7 @@ args <- commandArgs(trailingOnly=TRUE) # Check input args if (length(args) != 1) { - stop("Usage: diff_peaks.R [ annotate_design.tsv ] ", call.=FALSE) + stop("Usage: diff_peaks.R annotate_design.tsv ", call.=FALSE) } # Build DBA object from design file diff --git a/workflow/scripts/experiment_qc.py b/workflow/scripts/experiment_qc.py index 34bce3a1516895044d32a9e8853b670b03255361..466f84795f7c2783f68c166a8c53dcc268fd19ff 100644 --- 
diff --git a/workflow/scripts/convert_reads.py b/workflow/scripts/convert_reads.py
index 2c39680f97fcdc95eaee94a6d27cb5511e05ce95..bba7c6f16e49b7ea608d50546ec0b5af21ed4448 100644
--- a/workflow/scripts/convert_reads.py
+++ b/workflow/scripts/convert_reads.py
@@ -91,8 +91,7 @@ def convert_mapped_pe(bam, bam_basename):
     out, err = utils.run_pipe([
         "bamToBed -bedpe -mate1 -i %s" % (nmsrt_bam_filename),
-        "gzip -nc"],
-        outfile=bedpe_filename)
+        "gzip -nc"], outfile=bedpe_filename)
 
 
 def main():
@@ -109,7 +108,7 @@ def main():
     # Convert PE or SE to tagAlign
     bam_basename = os.path.basename(
-        utils.strip_extensions(bam, ['.bam']))
+        utils.strip_extensions(bam, ['.bam', '.dedup']))
 
     tag_filename = bam_basename + '.tagAlign.gz'
     convert_mapped(bam, tag_filename)
@@ -117,7 +116,7 @@ def main():
     if paired:  # paired-end data
         convert_mapped_pe(bam, bam_basename)
     else:
-        bedse_filename = bam_basename + ".bedse.gz"
+        bedse_filename = bam_basename + ".bedse.gz"
         shutil.copy(tag_filename, bedse_filename)
diff --git a/workflow/scripts/diff_peaks.R b/workflow/scripts/diff_peaks.R
index 76caa95495c7cae1d014530e78e1e1e3c4af09dc..aae1d24751f24396fff6fe712c28be27e36b0db3 100644
--- a/workflow/scripts/diff_peaks.R
+++ b/workflow/scripts/diff_peaks.R
@@ -8,7 +8,7 @@ args <- commandArgs(trailingOnly=TRUE)
 
 # Check input args
 if (length(args) != 1) {
-  stop("Usage: diff_peaks.R [ annotate_design.tsv ] ", call.=FALSE)
+  stop("Usage: diff_peaks.R annotate_design.tsv ", call.=FALSE)
 }
 
 # Build DBA object from design file
diff --git a/workflow/scripts/experiment_qc.py b/workflow/scripts/experiment_qc.py
index 34bce3a1516895044d32a9e8853b670b03255361..466f84795f7c2783f68c166a8c53dcc268fd19ff 100644
--- a/workflow/scripts/experiment_qc.py
+++ b/workflow/scripts/experiment_qc.py
@@ -7,8 +7,9 @@ import argparse
 import logging
 import subprocess
 import shutil
-import pandas as pd
 from multiprocessing import cpu_count
+import pandas as pd
+
 
 EPILOG = '''
 For more details:
@@ -196,11 +197,11 @@ def main():
     new_design_df = update_controls(design_df)
     for index, row in new_design_df.iterrows():
         check_enrichment(
-                        row['sample_id'],
-                        row['control_id'],
-                        row['bam_reads'],
-                        row['control_reads'],
-                        extension)
+            row['sample_id'],
+            row['control_id'],
+            row['bam_reads'],
+            row['control_reads'],
+            extension)
 
 
 if __name__ == '__main__':
diff --git a/workflow/scripts/map_qc.py b/workflow/scripts/map_qc.py
index 4cc549fa1652da3403b0d527b9127273b58e465c..920e0090a3fe69d50dd014e8541b5b9916cabecb 100644
--- a/workflow/scripts/map_qc.py
+++ b/workflow/scripts/map_qc.py
@@ -106,7 +106,7 @@ def filter_mapped_pe(bam, bam_basename):
         # Will produce name sorted BAM
         "samtools sort -n -@ %d -o %s" % (cpu_count(), tmp_filt_bam_filename)])
     if err:
-        logger.error("samtools filter error: %s" % (err))
+        logger.error("samtools filter error: %s", err)
 
     # Remove orphan reads (pair was removed)
     # and read pairs mapping to different chromosomes
@@ -136,7 +136,7 @@ def filter_mapped_se(bam, bam_basename):
     # not primary alignment, reads failing platform
     # Remove low MAPQ reads
     # Obtain name sorted BAM file
-    with open(filt_bam_filename, 'w') as fh:
+    with open(filt_bam_filename, 'w') as temp_file:
         samtools_filter_command = (
             "samtools view -F 1804 -q 30 -b %s"
             % (bam)
             )
@@ -144,7 +144,7 @@ def filter_mapped_se(bam, bam_basename):
         logger.info(samtools_filter_command)
         subprocess.check_call(
             shlex.split(samtools_filter_command),
-            stdout=fh)
+            stdout=temp_file)
 
     return filt_bam_filename
 
@@ -153,11 +153,11 @@ def dedup_mapped(bam, bam_basename, paired):
     '''Use sambamba and samtools to remove duplicates.'''
 
     # Markduplicates
-    dup_file_qc_filename = bam_basename + ".dup.qc"
+    dup_file_qc_filename = bam_basename + ".dedup.qc"
     tmp_dup_mark_filename = bam_basename + ".dupmark.bam"
     sambamba_params = "--hash-table-size=17592186044416" + \
                       " --overflow-list-size=20000000 --io-buffer-size=256"
-    with open(dup_file_qc_filename, 'w') as fh:
+    with open(dup_file_qc_filename, 'w') as temp_file:
         sambamba_markdup_command = (
             "sambamba markdup -t %d %s --tmpdir=%s %s %s"
             % (cpu_count(), sambamba_params, os.getcwd(), bam, tmp_dup_mark_filename)
@@ -165,11 +165,11 @@ def dedup_mapped(bam, bam_basename, paired):
             )
         logger.info(sambamba_markdup_command)
         subprocess.check_call(
             shlex.split(sambamba_markdup_command),
-            stderr=fh)
+            stderr=temp_file)
 
     # Remove duplicates
-    final_bam_prefix = bam_basename + ".filt.nodup"
+    final_bam_prefix = bam_basename + ".dedup"
     final_bam_filename = final_bam_prefix + ".bam"
 
     if paired:  # paired-end data
@@ -179,11 +179,11 @@ def dedup_mapped(bam, bam_basename, paired):
         samtools_dedupe_command = \
             "samtools view -F 1804 -b %s" % (tmp_dup_mark_filename)
 
-    with open(final_bam_filename, 'w') as fh:
+    with open(final_bam_filename, 'w') as temp_file:
         logger.info(samtools_dedupe_command)
         subprocess.check_call(
             shlex.split(samtools_dedupe_command),
-            stdout=fh)
+            stdout=temp_file)
 
     # Index final bam file
     sambamba_index_command = \
@@ -192,12 +192,12 @@ def dedup_mapped(bam, bam_basename, paired):
     subprocess.check_output(shlex.split(sambamba_index_command))
 
     # Generate mapping statistics
-    final_bam_file_mapstats_filename = final_bam_prefix + ".flagstat.qc"
-    with open(final_bam_file_mapstats_filename, 'w') as fh:
+    mapstats_filename = final_bam_prefix + ".flagstat.qc"
+    with open(mapstats_filename, 'w') as temp_file:
         flagstat_command = "sambamba flagstat -t %d %s" \
                             % (cpu_count(), final_bam_filename)
         logger.info(flagstat_command)
-        subprocess.check_call(shlex.split(flagstat_command), stdout=fh)
+        subprocess.check_call(shlex.split(flagstat_command), stdout=temp_file)
 
     os.remove(bam)
     return tmp_dup_mark_filename
@@ -206,7 +206,7 @@ def dedup_mapped(bam, bam_basename, paired):
 def compute_complexity(bam, paired, bam_basename):
     '''Calculate library complexity .'''
 
-    pbc_file_qc_filename = bam_basename + ".filt.nodup.pbc.qc"
+    pbc_file_qc_filename = bam_basename + ".pbc.qc"
     tmp_pbc_file_qc_filename = "tmp.%s" % (pbc_file_qc_filename)
 
     # Sort by name
@@ -223,12 +223,12 @@ def compute_complexity(bam, paired, bam_basename):
     # PBC1=OnePair/Distinct [tab]
     # PBC2=OnePair/TwoPair
     pbc_headers = ['TotalReadPairs',
-                  'DistinctReadPairs',
-                  'OneReadPair',
-                  'TwoReadPairs',
-                  'NRF',
-                  'PBC1',
-                  'PBC2']
+                   'DistinctReadPairs',
+                   'OneReadPair',
+                   'TwoReadPairs',
+                   'NRF',
+                   'PBC1',
+                   'PBC2']
 
     if paired:
         steps = [
@@ -247,11 +247,11 @@ def compute_complexity(bam, paired, bam_basename):
         ])
     out, err = utils.run_pipe(steps, tmp_pbc_file_qc_filename)
     if err:
-        logger.error("PBC file error: %s" % (err))
+        logger.error("PBC file error: %s", err)
 
     # Add headers
     pbc_file = pd.read_csv(tmp_pbc_file_qc_filename, sep='\t', header=None,
-                          names=pbc_headers)
+                           names=pbc_headers)
     pbc_file.to_csv(pbc_file_qc_filename, header=True, sep='\t', index=False)
     os.remove(bam)
     os.remove(bam + '.bai')
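For reference, the renamed `*.pbc.qc` report keeps the seven-column layout, and the three ratios follow directly from the first four counts (the definitions are spelled out in the comments of compute_complexity() above). A toy recomputation, with invented counts:

```python
def library_complexity(total, distinct, one_read, two_read):
    '''Recompute the pbc.qc ratios from its first four columns.

    NRF  = DistinctReadPairs / TotalReadPairs
    PBC1 = OneReadPair / DistinctReadPairs
    PBC2 = OneReadPair / TwoReadPairs
    '''
    return {'NRF': distinct / total,
            'PBC1': one_read / distinct,
            'PBC2': one_read / two_read}

# Counts below are invented, purely for illustration.
print(library_complexity(total=1000000, distinct=926192,
                         one_read=858466, two_read=62630))
```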
/^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""", - "sort", - "uniq"] + "bwa sampe -P %s %s %s %s %s" + % (reference, sai[0], sai[1], + fastq[0], fastq[1]), + "tee %s" % (sam_filename), + r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""", + "sort", + "uniq"] out, err = utils.run_pipe(steps, badcigar_filename) if err: - logger.error("sampe error: %s" % (err)) + logger.error("sampe error: %s", err) steps = [ "cat %s" % (sam_filename), @@ -147,7 +150,7 @@ def align_pe(fastq, sai, reference, fastq_basename): out, err = utils.run_pipe(steps) if err: - logger.error("samtools error: %s" % (err)) + logger.error("samtools error: %s", err) return bam_filename @@ -157,6 +160,7 @@ def main(): paired = args.paired fastq = args.fastq reference = args.reference + sample = args.sample # Create a file handler handler = logging.FileHandler('map.log') @@ -167,31 +171,25 @@ def main(): # Run Suffix Array generation sai = [] - for fq in fastq: - sai_filename = generate_sa(fq, reference) + for fastq_file in fastq: + sai_filename = generate_sa(fastq_file, reference) sai.append(sai_filename) + # Make file basename + fastq_basename = sample + # Run alignment for either PE or SE if paired: # paired-end data - fastq_r1_basename = os.path.basename( - utils.strip_extensions(fastq[0], STRIP_EXTENSIONS)) - fastq_r2_basename = os.path.basename( - utils.strip_extensions(fastq[1], STRIP_EXTENSIONS)) - fastq_basename = fastq_r1_basename + fastq_r2_basename - bam_filename = align_pe(fastq, sai, reference, fastq_basename) else: - fastq_basename = os.path.basename( - utils.strip_extensions(fastq[0], STRIP_EXTENSIONS)) - bam_filename = align_se(fastq, sai, reference, fastq_basename) - bam_mapstats_filename = '%s.srt.bam.flagstat.qc' % (fastq_basename) - with open(bam_mapstats_filename, 'w') as fh: + bam_mapstats_filename = '%s.flagstat.qc' % (fastq_basename) + with open(bam_mapstats_filename, 'w') as temp_file: subprocess.check_call( shlex.split("samtools flagstat %s" % (bam_filename)), - stdout=fh) + stdout=temp_file) # Remove sai files for sai_file in sai: diff --git a/workflow/scripts/motif_search.py b/workflow/scripts/motif_search.py index 9feac8c39201af96700f69370a70b24a7111a5fc..02e316f67a7bdaf26541ca864ceebfcac50e120a 100644 --- a/workflow/scripts/motif_search.py +++ b/workflow/scripts/motif_search.py @@ -3,16 +3,13 @@ '''Call Motifs on called peaks.''' import os -import sys -import re -from re import sub -import string import argparse import logging -import subprocess +import shutil +from multiprocessing import Pool import pandas as pd import utils -from multiprocessing import Pool + EPILOG = ''' For more details: @@ -27,6 +24,13 @@ logger.propagate = False logger.setLevel(logging.INFO) +# the order of this list is important. +# strip_extensions strips from the right inward, so +# the expected right-most extensions should appear first (like .gz) +# Modified from J. 
diff --git a/workflow/scripts/motif_search.py b/workflow/scripts/motif_search.py
index 9feac8c39201af96700f69370a70b24a7111a5fc..02e316f67a7bdaf26541ca864ceebfcac50e120a 100644
--- a/workflow/scripts/motif_search.py
+++ b/workflow/scripts/motif_search.py
@@ -3,16 +3,13 @@
 '''Call Motifs on called peaks.'''
 
 import os
-import sys
-import re
-from re import sub
-import string
 import argparse
 import logging
-import subprocess
+import shutil
+from multiprocessing import Pool
 import pandas as pd
 import utils
-from multiprocessing import Pool
+
 
 EPILOG = '''
 For more details:
@@ -27,6 +24,13 @@ logger.propagate = False
 logger.setLevel(logging.INFO)
 
 
+# the order of this list is important.
+# strip_extensions strips from the right inward, so
+# the expected right-most extensions should appear first (like .gz)
+# Modified from J. Seth Strattan
+STRIP_EXTENSIONS = ['.narrowPeak', '.replicated']
+
+
 def get_args():
     '''Define arguments.'''
 
@@ -51,19 +55,47 @@ def get_args():
 
 # Functions
+def check_tools():
+    '''Checks for required components on user system'''
+
+    logger.info('Checking for required libraries and components on this system')
+
+    meme_path = shutil.which("meme")
+    if meme_path:
+        logger.info('Found meme: %s', meme_path)
+    else:
+        logger.error('Missing meme')
+        raise Exception('Missing meme')
+
+    bedtools_path = shutil.which("bedtools")
+    if bedtools_path:
+        logger.info('Found bedtools: %s', bedtools_path)
+    else:
+        logger.error('Missing bedtools')
+        raise Exception('Missing bedtools')
+
+
 def run_wrapper(args):
-  motif_search(*args)
+    motif_search(*args)
+
 
 def motif_search(filename, genome, experiment, peak):
+    '''Run motif search on peaks.'''
+
+    file_basename = os.path.basename(
+        utils.strip_extensions(filename, STRIP_EXTENSIONS))
 
-    file_basename = os.path.basename(filename)
-    sorted_fn = 'sorted-%s' % (file_basename)
     out_fa = '%s.fa' % (experiment)
     out_motif = '%s_memechip' % (experiment)
 
     # Sort Bed file and limit number of peaks
     if peak == -1:
         peak = utils.count_lines(filename)
+        peak_no = 'all'
+    else:
+        peak_no = peak
+
+    sorted_fn = '%s.%s.narrowPeak' % (file_basename, peak_no)
 
     out, err = utils.run_pipe([
         'sort -k %dgr,%dgr %s' % (5, 5, filename),
@@ -73,9 +105,16 @@ def motif_search(filename, genome, experiment, peak):
     out, err = utils.run_pipe([
         'bedtools getfasta -fi %s -bed %s -fo %s' % (genome, sorted_fn, out_fa)])
 
+    if err:
+        logger.error("bedtools error: %s", err)
+
+    # Call memechip
     out, err = utils.run_pipe([
         'meme-chip -oc %s -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 %s -norand' % (out_motif, out_fa)])
 
+    if err:
+        logger.error("meme-chip error: %s", err)
+
 
 def main():
     args = get_args()
@@ -83,14 +122,22 @@ def main():
     genome = args.genome
     peak = args.peak
 
+    # Create a file handler
+    handler = logging.FileHandler('motif.log')
+    logger.addHandler(handler)
+
+    # Check if tools are present
+    check_tools()
+
     # Read files
     design_df = pd.read_csv(design, sep='\t')
 
-    meme_arglist = zip(design_df['Peaks'].tolist(), [genome]*design_df.shape[0], design_df['Condition'].tolist(), [peak]*design_df.shape[0])
-    work_pool = Pool(min(12,design_df.shape[0]))
-    return_list = work_pool.map(run_wrapper, meme_arglist)
+    meme_arglist = zip(design_df['Peaks'].tolist(), [genome]*design_df.shape[0], design_df['Condition'].tolist(), [peak]*design_df.shape[0])
+    work_pool = Pool(min(12, design_df.shape[0]))
+    return_list = work_pool.map(run_wrapper, meme_arglist)
     work_pool.close()
     work_pool.join()
 
-if __name__=="__main__":
-    main()
+
+if __name__ == '__main__':
+    main()
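The ordering warning on STRIP_EXTENSIONS is easy to miss: utils.strip_extensions() peels extensions off from the right inward, so the right-most extension ('.narrowPeak' here, '.gz' in the other scripts) must be listed first. A minimal sketch of that behavior (an assumed re-implementation for illustration; the real helper lives in utils.py):

```python
def strip_extensions(filename, extensions):
    '''Peel listed extensions off the right end, in list order --
    an assumed re-implementation of utils.strip_extensions().'''
    basename = filename
    for extension in extensions:
        if basename.endswith(extension):
            basename = basename[:-len(extension)]
    return basename

# '.narrowPeak' comes off first, then '.replicated':
print(strip_extensions('ENCSR238SGC.replicated.narrowPeak',
                       ['.narrowPeak', '.replicated']))  # 'ENCSR238SGC'
```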
diff --git a/workflow/scripts/overlap_peaks.py b/workflow/scripts/overlap_peaks.py
index 434caa08799a2d6b3bf460ea8520d08a5b92c852..61f0c2e6d11553e54bd9cf5a1cc5c629fa9acb5c 100644
--- a/workflow/scripts/overlap_peaks.py
+++ b/workflow/scripts/overlap_peaks.py
@@ -113,9 +113,9 @@ def overlap(experiment, design):
     # with any one of the overlapping peak pairs >= 0.5
 
     steps_true = ['intersectBed -wo -a %s -b %s' % (pool_peaks, true_rep_peaks[0]),
-                    awk_command,
-                    cut_command,
-                    'sort -u']
+                  awk_command,
+                  cut_command,
+                  'sort -u']
 
     iter_true_peaks = iter(true_rep_peaks)
     next(iter_true_peaks)
@@ -123,13 +123,13 @@ def overlap(experiment, design):
     if len(true_rep_peaks) > 1:
         for true_peak in true_rep_peaks[1:]:
             steps_true.extend(['intersectBed -wo -a stdin -b %s' % (true_peak),
-                                awk_command,
-                                cut_command,
-                                'sort -u'])
+                               awk_command,
+                               cut_command,
+                               'sort -u'])
 
     out, err = utils.run_pipe(steps_true, outfile=overlap_tr_fn)
     print("%d peaks overlap with both true replicates" %
-        (utils.count_lines(overlap_tr_fn)))
+          (utils.count_lines(overlap_tr_fn)))
 
     # Find pooled peaks that overlap PseudoRep1 and PseudoRep2
     # where overlap is defined as the fractional overlap
@@ -146,7 +146,7 @@ def overlap(experiment, design):
 
     out, err = utils.run_pipe(steps_pseudo, outfile=overlap_pr_fn)
     print("%d peaks overlap with both pooled pseudoreplicates"
-        % (utils.count_lines(overlap_pr_fn)))
+          % (utils.count_lines(overlap_pr_fn)))
 
     # Make union of peak lists
     out, err = utils.run_pipe([
@@ -154,7 +154,7 @@ def overlap(experiment, design):
         'sort -u'
         ], overlapping_peaks_fn)
     print("%d peaks overlap with true replicates or with pooled pseudoreplicates"
-        % (utils.count_lines(overlapping_peaks_fn)))
+          % (utils.count_lines(overlapping_peaks_fn)))
 
     # Make rejected peak list
     out, err = utils.run_pipe([
@@ -187,7 +187,7 @@ def main():
 
     # Make a design file for annotating Peaks
     anno_cols = ['Condition', 'Peaks']
-    design_anno = pd.DataFrame(columns = anno_cols)
+    design_anno = pd.DataFrame(columns=anno_cols)
 
     # Find consensus overlap peaks for each experiment
     for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
@@ -197,16 +197,16 @@ def main():
 
     # Write out design files
     design_diff.columns = ['SampleID',
-                            'bamReads',
-                            'Condition',
-                            'Tissue',
-                            'Factor',
-                            'Treatment',
-                            'Replicate',
-                            'ControlID',
-                            'bamControl',
-                            'Peaks',
-                            'PeakCaller']
+                           'bamReads',
+                           'Condition',
+                           'Tissue',
+                           'Factor',
+                           'Treatment',
+                           'Replicate',
+                           'ControlID',
+                           'bamControl',
+                           'Peaks',
+                           'PeakCaller']
 
     design_diff.to_csv("design_diffPeaks.csv", header=True, sep=',', index=False)
     design_anno.to_csv("design_annotatePeaks.tsv", header=True, sep='\t', index=False)
diff --git a/workflow/scripts/pool_and_psuedoreplicate.py b/workflow/scripts/pool_and_psuedoreplicate.py
index 3481495e69af6d0a734052a4b4f391a917174f95..f3d702f64b0cf1b9283b8ab4e26e9fbe7df57922 100644
--- a/workflow/scripts/pool_and_psuedoreplicate.py
+++ b/workflow/scripts/pool_and_psuedoreplicate.py
@@ -3,11 +3,14 @@
 '''Generate pooled and pseudoreplicate from data.'''
 
 import argparse
-import utils
 import logging
+import os
+import subprocess
+import shutil
+import shlex
 import pandas as pd
 import numpy as np
-import os
+import utils
 
 EPILOG = '''
 For more details:
@@ -21,6 +24,12 @@ logger.addHandler(logging.NullHandler())
 logger.propagate = False
 logger.setLevel(logging.INFO)
 
+# the order of this list is important.
+# strip_extensions strips from the right inward, so
+# the expected right-most extensions should appear first (like .gz)
+# Modified from J. Seth Strattan
+STRIP_EXTENSIONS = ['.gz', '.tagAlign', '.bedse', '.bedpe']
+
 
 def get_args():
     '''Define arguments.'''
@@ -87,21 +96,23 @@ def pool(tag_files, outfile, paired):
     else:
         file_extension = '.bedse.gz'
 
-    pooled_filename = outfile + file_extension
+    pool_basename = os.path.basename(
+        utils.strip_extensions(outfile, STRIP_EXTENSIONS))
+
+    pooled_filename = pool_basename + file_extension
 
     # Merge files
     out, err = utils.run_pipe([
         'gzip -dc %s' % (' '.join(tag_files)),
-        'gzip -cn'],
-        outfile=pooled_filename)
+        'gzip -cn'], outfile=pooled_filename)
 
     return pooled_filename
 
 
 def bedpe_to_tagalign(tag_file, outfile):
-    '''Convert read pairs to reads itno standard tagAlign file.'''
+    '''Convert read pairs to reads into standard tagAlign file.'''
 
-    se_tag_filename = outfile + "bedse.tagAlign.gz"
+    se_tag_filename = outfile + ".tagAlign.gz"
 
     # Convert read pairs to reads into standard tagAlign file
     tag_steps = ["zcat -f %s" % (tag_file)]
@@ -122,7 +133,7 @@ def self_psuedoreplication(tag_file, prefix, paired):
     lines_per_rep = (no_lines+1)/2
 
     # Make a dictionary of pseudoreplicate file names
-    pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.bedse.tagAlign.gz'
+    pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.tagAlign.gz'
                             for r in [0, 1]}
 
     # Shuffle and split file into equal parts
@@ -132,21 +143,26 @@ def self_psuedoreplication(tag_file, prefix, paired):
 
     splits_prefix = 'temp_split'
 
-    out, err = utils.run_pipe([
-        'gzip -dc %s' % (tag_file),
-        'shuf --random-source=%s' % (tag_file),
-        'split -d -l %d - %s' % (lines_per_rep, splits_prefix)])
+    pseudo_command = 'bash -c "zcat {} | shuf --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {} | wc -c) -nosalt </dev/zero 2>/dev/null) | '
+    pseudo_command += 'split -d -l {} - {}."'
+    pseudo_command = pseudo_command.format(
+        tag_file,
+        tag_file,
+        int(lines_per_rep),
+        splits_prefix)
+    logger.info("Running pseudoreplication with %s", pseudo_command)
+    subprocess.check_call(shlex.split(pseudo_command))
+
     # Convert read pairs to reads into standard tagAlign file
     for i, index in enumerate([0, 1]):
-        string_index = '0' + str(index)
+        string_index = '.0' + str(index)
         steps = ['cat %s' % (splits_prefix + string_index)]
         if paired:
             steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
         steps.extend(['gzip -cn'])
         out, err = utils.run_pipe(steps, outfile=pseudoreplicate_dict[i])
-        os.remove(splits_prefix + string_index)
 
     return pseudoreplicate_dict
@@ -213,7 +229,7 @@ def main():
     # Update tagAlign with single end data
     if paired:
         design_new_df['tag_align'] = design_new_df['se_tag_align']
-        design_new_df.drop(labels = 'se_tag_align', axis = 1, inplace = True)
+        design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
 
     design_new_df['replicate'] = design_new_df['replicate'].astype(str)
     design_new_df.at[1, 'sample_id'] = experiment_id + '_pr1'
@@ -243,8 +259,6 @@ def main():
         # Drop index column
         design_new_df.drop(labels='index', axis=1, inplace=True)
 
-
     else:
         # Make pool of replicates
         replicate_files = design_df.tag_align.unique()
@@ -269,7 +283,7 @@ def main():
         pool_pseudoreplicates_dict = {}
         for index, row in pseudoreplicates_df.iterrows():
             replicate_id = index + 1
-            pr_filename = experiment_id + ".pr" + str(replicate_id) + '.bedse.gz'
+            pr_filename = experiment_id + ".pr" + str(replicate_id) + '.tagAlign.gz'
             pool_replicate = pool(row, pr_filename, False)
             pool_pseudoreplicates_dict[replicate_id] = pool_replicate
 
@@ -277,16 +291,16 @@ def main():
         # Update tagAlign with single end data
         if paired:
             design_new_df['tag_align'] = design_new_df['se_tag_align']
-            design_new_df.drop(labels = 'se_tag_align', axis = 1, inplace = True)
+            design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
 
         # Check controls against cutoff_ratio
         # if so replace with pool_control
         # unless single control was used
         if not single_control:
             path_to_pool_control = cwd + '/' + pool_control
-            if control_df.values.max() > 1.2:
+            if control_df.values.max() > cutoff_ratio:
                 logger.info("Number of reads in controls differ by " +
-                            " > factor of %f. Using pooled controls." % (cutoff_ratio))
+                            " > factor of %f. Using pooled controls." % (cutoff_ratio))
                 design_new_df['control_tag_align'] = path_to_pool_control
             else:
                 for index, row in design_new_df.iterrows():
@@ -302,14 +316,14 @@ def main():
                     if paired:
                         control = row['control_tag_align']
                         control_basename = os.path.basename(
-                            utils.strip_extensions(control, ['.filt.nodup.bedpe.gz']))
-                        control_tmp = bedpe_to_tagalign(control , "control_basename")
+                            utils.strip_extensions(control, STRIP_EXTENSIONS))
+                        control_tmp = bedpe_to_tagalign(control, control_basename)
                         path_to_control = cwd + '/' + control_tmp
                         design_new_df.loc[index, 'control_tag_align'] = \
                             path_to_control
 
         else:
-            path_to_pool_control = pool_control
+            path_to_pool_control = pool_control
 
         # Add in pseudo replicates
         tmp_metadata = design_new_df.loc[0].copy()
@@ -333,7 +347,7 @@ def main():
 
     # Write out new dataframe
     design_new_df.to_csv(experiment_id + '_ppr.tsv',
-                        header=True, sep='\t', index=False)
+                         header=True, sep='\t', index=False)
 
 
 if __name__ == '__main__':
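The new pseudo_command above makes the shuffle deterministic: shuf draws its randomness from an openssl keystream keyed on the input's uncompressed byte count, and split -d -l then halves the file. A rough Python equivalent of that shuffle-and-halve step (the seeding scheme here is a simplified stand-in for the openssl trick, purely for illustration):

```python
import gzip
import random

def self_pseudoreplicate(tag_path, prefix):
    '''Shuffle a tagAlign deterministically and split it into two
    pseudoreplicates, mirroring self_psuedoreplication().'''
    with gzip.open(tag_path, 'rt') as handle:
        lines = handle.readlines()

    # Seed from a property of the input so reruns agree; the real
    # command keys an openssl stream on the uncompressed byte count.
    seed = sum(len(line) for line in lines)
    random.Random(seed).shuffle(lines)

    half = (len(lines) + 1) // 2
    pseudoreplicate_dict = {r: '%s.pr%d.tagAlign.gz' % (prefix, r)
                            for r in (0, 1)}
    for r, chunk in enumerate((lines[:half], lines[half:])):
        with gzip.open(pseudoreplicate_dict[r], 'wt') as out_file:
            out_file.writelines(chunk)
    return pseudoreplicate_dict
```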
"plotCoverage -b "+" ".join(files)+" --plotFile experiment.deeptools_coverage.png -n 1000000 --plotTitle \"sample coverage\" --ignoreDuplicates --minMappingQuality 10" - p = subprocess.Popen(pcov_command, shell=True) - p.communicate() - #draw fingerprints plots - for treat,ctrl,name in zip(files,controls,labels): - fp_command = "plotFingerprint -b "+treat+" "+ctrl+" --labels "+name+" control --plotFile "+name+".deeptools_fingerprints.png" - p = subprocess.Popen(fp_command, shell=True) - p.communicate() - -def bam2bw_wrapper(command): - p = subprocess.Popen(command, shell=True) - p.communicate() - -def run_signal(files, labels, genome): - #compute matrix and draw profile and heatmap - gene_bed = genome+"/gene.bed"#"/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/genome/"+genome+"/gene.bed" - bw_commands = [] - for f in files: - bw_commands.append("bamCoverage -bs 10 -b "+f+" -o "+f.replace("bam","bw")) - work_pool = Pool(min(len(files), 12)) - work_pool.map(bam2bw_wrapper, bw_commands) - work_pool.close() - work_pool.join() - - cm_command = "computeMatrix scale-regions -R "+gene_bed+" -a 3000 -b 3000 --regionBodyLength 5000 --skipZeros -S *.bw -o samples.deeptools_generegionscalematrix.gz" - p = subprocess.Popen(cm_command, shell=True) - p.communicate() - hm_command = "plotHeatmap -m samples.deeptools_generegionscalematrix.gz -out samples.deeptools_readsHeatmap.png" - p = subprocess.Popen(hm_command, shell=True) - p.communicate() - -def run(dfile,genome): - #parse dfile, suppose data files are the same folder as design file - dfile = pd.read_csv(dfile) - #QC: multiBamSummary and plotCorrelation - run_qc(dfile['bamReads'], dfile['bamControl'], dfile['SampleID']) - #signal plots - run_signal(dfile['bamReads'],dfile['SampleID'],genome) - -def main(): - argparser = prepare_argparser() - args = argparser.parse_args() - run(args.infile, args.genome) - - -if __name__=="__main__": - main() diff --git a/workflow/scripts/trim_reads.py b/workflow/scripts/trim_reads.py index 036c5f700eb61011cce4008230b60c634ea1c582..e31ec938b6d7e0ef5570afc394c98a365041f863 100644 --- a/workflow/scripts/trim_reads.py +++ b/workflow/scripts/trim_reads.py @@ -5,6 +5,7 @@ import subprocess import argparse import shutil +import os import logging EPILOG = ''' @@ -32,6 +33,10 @@ def get_args(): nargs='+', required=True) + parser.add_argument('-s', '--sample', + help="The name of the sample.", + required=True) + parser.add_argument('-p', '--paired', help="True/False if paired-end or single end.", default=False, @@ -61,6 +66,32 @@ def check_tools(): raise Exception('Missing cutadapt') +def rename_reads(fastq, sample, paired): + '''Rename fastq files by sample name.''' + + # Get current directory to build paths + cwd = os.getcwd() + + renamed_fastq = [] + + if paired: # paired-end data + # Set file names + renamed_fastq.append(cwd + '/' + sample + '_R1.fastq.gz') + renamed_fastq.append(cwd + '/' + sample + '_R2.fastq.gz') + + # Great symbolic links + os.symlink(fastq[0], renamed_fastq[0]) + os.symlink(fastq[1], renamed_fastq[1]) + else: + # Set file names + renamed_fastq.append(cwd + '/' + sample + '_R1.fastq.gz') + + # Great symbolic links + os.symlink(fastq[0], renamed_fastq[0]) + + return renamed_fastq + + def trim_reads(fastq, paired): '''Run trim_galore on 1 or 2 files.''' @@ -82,6 +113,7 @@ def trim_reads(fastq, paired): def main(): args = get_args() fastq = args.fastq + sample = args.sample paired = args.paired # Create a file handler @@ -91,8 +123,11 @@ def main(): # Check if tools are present check_tools() + # 
diff --git a/workflow/scripts/utils.py b/workflow/scripts/utils.py
index a50167367eec13fc0548bfbffbfa49fa8db58c60..6c2da13b056b386e454a309552f5a3623b1ef254 100644
--- a/workflow/scripts/utils.py
+++ b/workflow/scripts/utils.py
@@ -16,7 +16,6 @@ logger.propagate = True
 
 
 def run_pipe(steps, outfile=None):
-    # TODO: capture stderr
     from subprocess import Popen, PIPE
     p = None
     p_next = None
@@ -98,13 +97,13 @@ def rescale_scores(filename, scores_col, new_min=10, new_max=1000):
         'head -n 1 %s' % (sorted_fn),
         'cut -f %s' % (scores_col)])
     max_score = float(out.strip())
-    logger.info("rescale_scores: max_score = %s" % (max_score))
+    logger.info("rescale_scores: max_score = %s", max_score)
 
     out, err = run_pipe([
         'tail -n 1 %s' % (sorted_fn),
         'cut -f %s' % (scores_col)])
     min_score = float(out.strip())
-    logger.info("rescale_scores: min_score = %s" % (min_score))
+    logger.info("rescale_scores: min_score = %s", min_score)
 
     a = min_score
     b = max_score
diff --git a/workflow/scripts/xcor.py b/workflow/scripts/xcor.py
index f799137aa41caddf5f27729be9abb0eebeeb7482..ac2ff65f7f06707e892e1033106dc2219ecb96cd 100644
--- a/workflow/scripts/xcor.py
+++ b/workflow/scripts/xcor.py
@@ -22,6 +22,13 @@ logger.propagate = False
 logger.setLevel(logging.INFO)
 
 
+# the order of this list is important.
+# strip_extensions strips from the right inward, so
+# the expected right-most extensions should appear first (like .gz)
+# Modified from J. Seth Strattan
+STRIP_EXTENSIONS = ['.gz', '.tagAlign', '.bedse', '.bedpe']
+
+
 def get_args():
     '''Define arguments.'''
 
@@ -60,20 +67,20 @@ def check_tools():
 def xcor(tag, paired):
     '''Use spp to calculate cross-correlation stats.'''
 
-    tag_basename = os.path.basename(utils.strip_extensions(tag, ['.gz']))
+    tag_basename = os.path.basename(utils.strip_extensions(tag, STRIP_EXTENSIONS))
 
     uncompressed_tag_filename = tag_basename
 
     # Subsample tagAlign file
-    NREADS = 15000000
+    number_reads = 15000000
     subsampled_tag_filename = \
-        tag_basename + ".%d.tagAlign.gz" % (NREADS/1000000)
+        tag_basename + ".%d.tagAlign.gz" % (number_reads/1000000)
 
     steps = [
         'zcat %s' % (tag),
         'grep -v "chrM"',
-        'shuf -n %d --random-source=%s' % (NREADS, tag)]
+        'shuf -n %d --random-source=%s' % (number_reads, tag)]
 
     if paired:  # paired-end data
         steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
 
@@ -83,8 +90,8 @@ def xcor(tag, paired):
     out, err = utils.run_pipe(steps, outfile=subsampled_tag_filename)
 
     # Calculate Cross-correlation QC scores
-    cc_scores_filename = subsampled_tag_filename + ".cc.qc"
-    cc_plot_filename = subsampled_tag_filename + ".cc.plot.pdf"
+    cc_scores_filename = tag_basename + ".cc.qc"
+    cc_plot_filename = tag_basename + ".cc.plot.pdf"
 
     # CC_SCORE FILE format
     # Filename <tab>
@@ -109,6 +116,7 @@ def xcor(tag, paired):
 
     return cc_scores_filename
 
+
 def main():
     args = get_args()
     paired = args.paired
@@ -122,7 +130,7 @@ def main():
     check_tools()
 
     # Calculate Cross-correlation
-    xcor_filename = xcor(tag, paired)
+    xcor(tag, paired)
 
 
 if __name__ == '__main__':
diff --git a/workflow/tests/test_annotate_peaks.py b/workflow/tests/test_annotate_peaks.py
index 692ddced7762a9a9742c6bf51e652673b62bdd1f..ee3ebe12983b82607d7f3fe5559869633a7a8a72 100644
--- a/workflow/tests/test_annotate_peaks.py
+++ b/workflow/tests/test_annotate_peaks.py
@@ -11,18 +11,33 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
 
 @pytest.mark.singleend
-def test_annotate_peaks_singleend():
+def test_pie_singleend():
     assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC.chipseeker_pie.pdf'))
+
+
+@pytest.mark.singleend
+def test_upsetplot_singleend():
     assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC.chipseeker_upsetplot.pdf'))
+
+
+@pytest.mark.singleend
+def test_annotation_singleend():
     annotation_file = test_output_path + 'ENCSR238SGC.chipseeker_annotation.tsv'
     assert os.path.exists(annotation_file)
-    assert utils.count_lines(annotation_file) == 152840
+    assert utils.count_lines(annotation_file) == 152255
 
 
 @pytest.mark.pairedend
-def test_annotate_peaks_pairedend():
+def test_pie_pairedend():
     assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA.chipseeker_pie.pdf'))
+
+
+@pytest.mark.pairedend
+def test_upsetplot_pairedend():
     assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA.chipseeker_upsetplot.pdf'))
+
+
+@pytest.mark.pairedend
+def test_annotation_pairedend():
     annotation_file = test_output_path + 'ENCSR729LGA.chipseeker_annotation.tsv'
     assert os.path.exists(annotation_file)
-    assert utils.count_lines(annotation_file) == 25391
+    assert utils.count_lines(annotation_file) == 25494
diff --git a/workflow/tests/test_call_peaks_macs.py b/workflow/tests/test_call_peaks_macs.py
index 71358bc045801d23d28b8f7385c5ef9f608347b6..9f3f6c0778a888ff30c795fba2f2126a4a3bb672 100644
--- a/workflow/tests/test_call_peaks_macs.py
+++ b/workflow/tests/test_call_peaks_macs.py
@@ -9,16 +9,42 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
 
 @pytest.mark.singleend
-def test_call_peaks_macs_singleend():
+def test_fc_signal_singleend():
     assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT.fc_signal.bw'))
+
+
+@pytest.mark.singleend
+def test_pvalue_signal_singleend():
     assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT.pvalue_signal.bw'))
-    peak_file = test_output_path + 'ENCLB144FDT_peaks.narrowPeak'
+
+
+@pytest.mark.singleend
+def test_peaks_xls_singleend():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT_peaks.xls'))
+
+
+@pytest.mark.singleend
+def test_peaks_bed_singleend():
+    peak_file = test_output_path + 'ENCLB144FDT.narrowPeak'
     assert utils.count_lines(peak_file) == 227389
 
 
 @pytest.mark.pairedend
-def test_call_peaks_macs_pairedend():
+def test_fc_signal_pairedend():
     assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX.fc_signal.bw'))
+
+
+@pytest.mark.pairedend
+def test_pvalue_signal_pairedend():
     assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX.pvalue_signal.bw'))
-    peak_file = test_output_path + 'ENCLB568IYX_peaks.narrowPeak'
-    assert utils.count_lines(peak_file) == 112652
+
+
+@pytest.mark.pairedend
+def test_peaks_xls_pairedend():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX_peaks.xls'))
+
+
+@pytest.mark.pairedend
+def test_peaks_bed_pairedend():
+    peak_file = test_output_path + 'ENCLB568IYX.narrowPeak'
+    assert utils.count_lines(peak_file) == 113821
diff --git a/workflow/tests/test_check_design.py b/workflow/tests/test_check_design.py
index 517c53c71cb31edb18c159b29636381aa66972d0..de02891773f3a090ea476ceee60a8c30f052c57d 100644
--- a/workflow/tests/test_check_design.py
+++ b/workflow/tests/test_check_design.py
@@ -63,6 +63,24 @@ def design_4(design):
     return design
 
 
+@pytest.fixture
+def design_5(design):
+    # Update sample_id to have -, spaces or periods
+    design.loc[design['sample_id'] == 'A_1', 'sample_id'] = 'A 1'
+    design.loc[design['sample_id'] == 'A_2', 'sample_id'] = 'A.2'
+    design.loc[design['sample_id'] == 'B_1', 'sample_id'] = 'B-1'
+    return design
+
+
+@pytest.fixture
+def design_6(design):
+    # Update experiment_id to have -, spaces or periods
+    design.loc[design['sample_id'] == 'A_1', 'experiment_id'] = 'A ChIP'
+    design.loc[design['sample_id'] == 'A_2', 'experiment_id'] = 'A.ChIP'
+    design.loc[design['sample_id'] == 'B_1', 'experiment_id'] = 'B-ChIP'
+    return design
+
+
 @pytest.fixture
 def fastq_files_1(fastq_files):
     # Drop B_2.fastq.gz
@@ -115,10 +133,25 @@ def test_check_files_output_pairedend(design_3, fastq_files):
     assert new_design.loc[0, 'fastq_read2'] == "/path/to/file/A_2.fastq.gz"
 
 
-
 @pytest.mark.unit
 def test_check_replicates(design_4):
     paired = False
     with pytest.raises(Exception) as excinfo:
         new_design = check_design.check_replicates(design_4)
     assert str(excinfo.value) == "Duplicate replicates in experiments: ['B']"
+
+
+@pytest.mark.unit
+def test_check_samples(design_5):
+    paired = False
+    with pytest.raises(Exception) as excinfo:
+        new_design = check_design.check_samples(design_5)
+    assert str(excinfo.value) == "Malformed samples from design file: ['A 1', 'A.2', 'B-1']"
+
+
+@pytest.mark.unit
+def test_check_experiments(design_6):
+    paired = False
+    with pytest.raises(Exception) as excinfo:
+        new_design = check_design.check_experiments(design_6)
+    assert str(excinfo.value) == "Malformed experiment from design file: ['A ChIP', 'A.ChIP', 'B-ChIP']"
diff --git a/workflow/tests/test_convert_reads.py b/workflow/tests/test_convert_reads.py
index ff8a003bfc91f5778648b238a2d94a1e04c366cc..8dda74678d42c0049eb74caa205ad8b662772bb5 100644
--- a/workflow/tests/test_convert_reads.py
+++ b/workflow/tests/test_convert_reads.py
@@ -8,12 +8,20 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
 
 @pytest.mark.singleend
-def test_convert_reads_singleend():
-    assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.tagAlign.gz'))
-    assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bedse.gz'))
+def test_tag_reads_singleend():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI.tagAlign.gz'))
+
+
+@pytest.mark.singleend
+def test_bed_reads_singleend():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI.bedse.gz'))
+
+
+@pytest.mark.pairedend
+def test_tag_reads_pairedend():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX.tagAlign.gz'))
 
 
 @pytest.mark.pairedend
-def test_map_qc_pairedend():
-    assert os.path.exists(os.path.join(test_output_path, 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.tagAlign.gz'))
-    assert os.path.exists(os.path.join(test_output_path, 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.bedpe.gz'))
+def test_bed_reads_pairedend():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX.bedpe.gz'))
diff --git a/workflow/tests/test_diff_peaks.py b/workflow/tests/test_diff_peaks.py
index 1d48b6182efd600400b8d2032732dfbb62b704e7..b0a76aaece94d6b795efee271ffd64404da8ad92 100644
--- a/workflow/tests/test_diff_peaks.py
+++ b/workflow/tests/test_diff_peaks.py
@@ -15,25 +15,58 @@ def test_diff_peaks_singleend_single_rep():
     assert os.path.isdir(test_output_path) == False
 
 
 @pytest.mark.pairedend
-def test_annotate_peaks_pairedend_single_rep():
+def test_diff_peaks_pairedend_single_rep():
     assert os.path.isdir(test_output_path) == False
 
 
 @pytest.mark.singlediff
-def test_diff_peaks_singleend_multiple_rep():
+def test_heatmap_singleend_multiple_rep():
     assert os.path.exists(os.path.join(test_output_path, 'heatmap.pdf'))
+
+
+@pytest.mark.singlediff
+def test_pca_singleend_multiple_rep():
     assert os.path.exists(os.path.join(test_output_path, 'pca.pdf'))
+
+
+@pytest.mark.singlediff
+def test_normcount_singleend_multiple_rep():
     assert os.path.exists(os.path.join(test_output_path, 'normcount_peaksets.txt'))
-    assert os.path.exists(os.path.join(test_output_path, 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.bed'))
-    diffbind_file = test_output_path + 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.csv'
+
+
+@pytest.mark.singlediff
+def test_diffbind_singleend_multiple_rep():
+    if os.path.isfile(os.path.join(test_output_path, 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.bed')):
+        assert os.path.exists(os.path.join(test_output_path, 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.bed'))
+        diffbind_file = test_output_path + 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.csv'
+    elif os.path.isfile(os.path.join(test_output_path, 'ENCSR238SGC_vs_ENCSR272GNQ_diffbind.bed')):
+        assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC_vs_ENCSR272GNQ_diffbind.bed'))
+        diffbind_file = test_output_path + 'ENCSR238SGC_vs_ENCSR272GNQ_diffbind.csv'
     assert os.path.exists(diffbind_file)
-    assert utils.count_lines(diffbind_file) == 201039
+    assert utils.count_lines(diffbind_file) == 197471
 
 
 @pytest.mark.paireddiff
-def test_annotate_peaks_pairedend_single_rep():
+def test_heatmap_pairedend_single_rep():
     assert os.path.exists(os.path.join(test_output_path, 'heatmap.pdf'))
+
+
+@pytest.mark.paireddiff
+def test_pca_pairedend_single_rep():
     assert os.path.exists(os.path.join(test_output_path, 'pca.pdf'))
+
+
+@pytest.mark.paireddiff
+def test_normcount_pairedend_single_rep():
     assert os.path.exists(os.path.join(test_output_path, 'normcount_peaksets.txt'))
-    assert os.path.exists(os.path.join(test_output_path, 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.bed'))
-    diffbind_file = test_output_path + 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.csv'
+
+
+@pytest.mark.paireddiff
+def test_diffbind_pairedend_single_rep():
+    if os.path.isfile(os.path.join(test_output_path, 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.bed')):
+        assert os.path.exists(os.path.join(test_output_path, 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.bed'))
+        diffbind_file = test_output_path + 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.csv'
+    elif os.path.isfile(os.path.join(test_output_path, 'ENCSR729LGA_vs_ENCSR757EMK_diffbind.bed')):
+        assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA_vs_ENCSR757EMK_diffbind.bed'))
+        diffbind_file = test_output_path + 'ENCSR729LGA_vs_ENCSR757EMK_diffbind.csv'
     assert os.path.exists(diffbind_file)
     assert utils.count_lines(diffbind_file) == 66201
diff --git a/workflow/tests/test_experiment_qc.py b/workflow/tests/test_experiment_qc.py
index 3c4c9d0411faced4392b85f97e318d9ba58cf792..c0cd5d7d340b27b3217620ef4b12a1391841820b 100644
--- a/workflow/tests/test_experiment_qc.py
+++ b/workflow/tests/test_experiment_qc.py
@@ -31,19 +31,44 @@ def test_check_update_controls(design_bam):
 
 
 @pytest.mark.singleend
-def test_experiment_qc_singleend():
+def test_coverage_singleend():
     assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz'))
+    assert os.path.exists(os.path.join(test_output_path, 'coverage.pdf'))
+
+
+@pytest.mark.singleend
+def test_spearman_singleend():
     assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.pdf'))
+
+
+@pytest.mark.singleend
+def test_pearson_singleend():
     assert os.path.exists(os.path.join(test_output_path, 'heatmap_PearsonCorr.pdf'))
-    assert os.path.exists(os.path.join(test_output_path, 'coverage.pdf'))
+
+
+@pytest.mark.singleend
+def test_fingerprint_singleend():
     assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT_fingerprint.pdf'))
     assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI_fingerprint.pdf'))
 
 
-@pytest.mark.pairdend
-def test_experiment_qc_pairedend():
+@pytest.mark.pairedend
+def test_coverage_pairedend():
     assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz'))
+    assert os.path.exists(os.path.join(test_output_path, 'coverage.pdf'))
+
+
+@pytest.mark.pairedend
+def test_spearman_pairedend():
     assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.pdf'))
+
+
+@pytest.mark.pairedend
+def test_pearson_pairedend():
     assert os.path.exists(os.path.join(test_output_path, 'heatmap_PearsonCorr.pdf'))
-    assert os.path.exists(os.path.join(test_output_path, 'coverage.pdf'))
+
+
+@pytest.mark.pairedend
+def test_fingerprint_pairedend():
     assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX_fingerprint.pdf'))
     assert os.path.exists(os.path.join(test_output_path, 'ENCLB637LZP_fingerprint.pdf'))
'47389080 + 0 in total' in samtools_report[0] - assert '47389080 + 0 mapped (100.00%:N/A)' in samtools_report[4] - library_complexity = test_output_path + 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.pbc.qc' + assert '47388510 + 0 in total' in samtools_report[0] + assert '47388510 + 0 mapped (100.00%:N/A)' in samtools_report[4] + + +@pytest.mark.pairedend +def test_library_complexity_pairedend(): + library_complexity = test_output_path + 'ENCLB568IYX.pbc.qc' df_library_complexity = pd.read_csv(library_complexity, sep='\t') assert df_library_complexity["NRF"].iloc[0] == 0.947064 - assert df_library_complexity["PBC1"].iloc[0] == 0.946724 - assert df_library_complexity["PBC2"].iloc[0] == 18.643039 + assert round(df_library_complexity["PBC1"].iloc[0],6) == 0.946723 + assert round(df_library_complexity["PBC2"].iloc[0],6) == 18.642645 diff --git a/workflow/tests/test_map_reads.py b/workflow/tests/test_map_reads.py index 328858216abb88528c2d25e06bce20d7d469f1cb..60a2e39009f64f2a554d200ba41f1f4d28fe92a8 100644 --- a/workflow/tests/test_map_reads.py +++ b/workflow/tests/test_map_reads.py @@ -9,8 +9,8 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ @pytest.mark.singleend def test_map_reads_singleend(): - assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.srt.bam')) - aligned_reads_report = test_output_path + 'ENCFF646LXU.srt.bam.flagstat.qc' + assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI.bam')) + aligned_reads_report = test_output_path + 'ENCLB831RUI.flagstat.qc' samtools_report = open(aligned_reads_report).readlines() assert '80795025 + 0 in total' in samtools_report[0] assert '80050072 + 0 mapped (99.08% : N/A)' in samtools_report[4] @@ -18,8 +18,8 @@ def test_map_reads_singleend(): @pytest.mark.pairedend def test_map_reads_pairedend(): - assert os.path.exists(os.path.join(test_output_path, 'ENCFF002DTU_val_1ENCFF002EFI_val_2.srt.bam')) - aligned_reads_report = test_output_path + 'ENCFF002DTU_val_1ENCFF002EFI_val_2.srt.bam.flagstat.qc' + assert os.path.exists(os.path.join(test_output_path, 'ENCLB678IDC.bam')) + aligned_reads_report = test_output_path + 'ENCLB678IDC.flagstat.qc' samtools_report = open(aligned_reads_report).readlines() assert '72660890 + 0 in total' in samtools_report[0] assert '72053925 + 0 mapped (99.16% : N/A)' in samtools_report[4] diff --git a/workflow/tests/test_motif_search.py b/workflow/tests/test_motif_search.py index ca84f4a60745e5f9080ba40fe148787b5bd740cf..8c1265211b87da7d006b9020087ce04994889fd0 100644 --- a/workflow/tests/test_motif_search.py +++ b/workflow/tests/test_motif_search.py @@ -10,18 +10,27 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ '/../output/motifSearch/' +@pytest.mark.singleend +def test_limited_peaks_singleend(): + peak_file_ENCSR238SGC = test_output_path + 'ENCSR238SGC.600.narrowPeak' + assert os.path.exists(peak_file_ENCSR238SGC) + assert utils.count_lines(peak_file_ENCSR238SGC) == 600 + + @pytest.mark.singleend def test_motif_search_singleend(): assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC_memechip', 'ENCSR238SGC.fa')) assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC_memechip', 'index.html')) - peak_file_ENCSR238SGC = test_output_path + 'sorted-ENCSR238SGC.replicated.narrowPeak' - assert os.path.exists(peak_file_ENCSR238SGC) - assert utils.count_lines(peak_file_ENCSR238SGC) == 600 + +@pytest.mark.pairedend +def test_limited_peaks_pairedend(): + peak_file_ENCSR729LGA= test_output_path + 'ENCSR729LGA.600.narrowPeak' + assert 
+    assert utils.count_lines(peak_file_ENCSR729LGA) == 600
+
+
 
 @pytest.mark.pairedend
 def test_motif_search_pairedend():
     assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA_memechip', 'ENCSR729LGA.fa'))
     assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA_memechip', 'index.html'))
-    peak_file_ENCSR729LGA= test_output_path + 'sorted-ENCSR729LGA.replicated.narrowPeak'
-    assert os.path.exists(peak_file_ENCSR729LGA)
-    assert utils.count_lines(peak_file_ENCSR729LGA) == 600
diff --git a/workflow/tests/test_overlap_peaks.py b/workflow/tests/test_overlap_peaks.py
index 99d43b87939617c801bad0389f14e2683cde0f82..ff61c937f42af911bba8e8e21acde80bd41a592e 100644
--- a/workflow/tests/test_overlap_peaks.py
+++ b/workflow/tests/test_overlap_peaks.py
@@ -37,11 +37,11 @@ def test_check_update_design(design_diff):
 def test_overlap_peaks_singleend():
     assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC.rejected.narrowPeak'))
     peak_file = test_output_path + 'ENCSR238SGC.replicated.narrowPeak'
-    assert utils.count_lines(peak_file) == 152848
+    assert utils.count_lines(peak_file) == 152262
 
 
 @pytest.mark.pairedend
 def test_overlap_peaks_pairedend():
     assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA.rejected.narrowPeak'))
     peak_file = test_output_path + 'ENCSR729LGA.replicated.narrowPeak'
-    assert utils.count_lines(peak_file) == 25655
+    assert utils.count_lines(peak_file) == 25758
diff --git a/workflow/tests/test_trim_reads.py b/workflow/tests/test_trim_reads.py
index 8e2176d7044392f9ed9e00801b3e14d1c627874d..ca25db6a9131abe26ded6b8746c11cecda502a49 100644
--- a/workflow/tests/test_trim_reads.py
+++ b/workflow/tests/test_trim_reads.py
@@ -13,20 +13,28 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
 @pytest.mark.singleend
 def test_trim_reads_singleend():
     raw_fastq = test_data_path + 'ENCFF833BLU.fastq.gz'
-    trimmed_fastq = test_output_path + 'ENCFF833BLU_trimmed.fq.gz'
-    trimmed_fastq_report = test_output_path + \
-        'ENCFF833BLU.fastq.gz_trimming_report.txt'
+    trimmed_fastq = test_output_path + 'ENCLB144FDT_R1_trimmed.fq.gz'
     assert os.path.getsize(raw_fastq) != os.path.getsize(trimmed_fastq)
     assert os.path.getsize(trimmed_fastq) == 2512853101
+
+
+@pytest.mark.singleend
+def test_trim_report_singleend():
+    trimmed_fastq_report = test_output_path + \
+        'ENCLB144FDT_R1.fastq.gz_trimming_report.txt'
     assert 'Trimming mode: single-end' in open(trimmed_fastq_report).readlines()[4]
 
 
 @pytest.mark.pairedend
 def test_trim_reads_pairedend():
     raw_fastq = test_data_path + 'ENCFF582IOZ.fastq.gz'
-    trimmed_fastq = test_output_path + 'ENCFF582IOZ_val_2.fq.gz'
-    trimmed_fastq_report = test_output_path + \
-        'ENCFF582IOZ.fastq.gz_trimming_report.txt'
+    trimmed_fastq = test_output_path + 'ENCLB637LZP_R2_val_2.fq.gz'
     assert os.path.getsize(raw_fastq) != os.path.getsize(trimmed_fastq)
     assert os.path.getsize(trimmed_fastq) == 2229312710
+
+
+@pytest.mark.pairedend
+def test_trim_report_pairedend():
+    trimmed_fastq_report = test_output_path + \
+        'ENCLB637LZP_R2.fastq.gz_trimming_report.txt'
     assert 'Trimming mode: paired-end' in open(trimmed_fastq_report).readlines()[4]
diff --git a/workflow/tests/test_xcor.py b/workflow/tests/test_xcor.py
index 2d3fdcff16d3a42c5af2e7c4beb6f777e0ddbba0..259ecfb1f82ce80e79ee14539c4bb603bb5ddfb4 100644
--- a/workflow/tests/test_xcor.py
+++ b/workflow/tests/test_xcor.py
@@ -9,9 +9,13 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
 
 
 @pytest.mark.singleend
-def test_cross_singleend():
-    assert os.path.exists(os.path.join(test_output_path, 'ENCFF833BLU.filt.nodup.tagAlign.15.tagAlign.gz.cc.plot.pdf'))
-    qc_file = os.path.join(test_output_path,"ENCFF833BLU.filt.nodup.tagAlign.15.tagAlign.gz.cc.qc")
+def test_cross_plot_singleend():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT.cc.plot.pdf'))
+
+
+@pytest.mark.singleend
+def test_cross_qc_singleend():
+    qc_file = os.path.join(test_output_path, "ENCLB144FDT.cc.qc")
     df_xcor = pd.read_csv(qc_file, sep="\t", header=None)
     assert df_xcor[2].iloc[0] == '190,200,210'
     assert df_xcor[8].iloc[0] == 1.025906
@@ -19,10 +23,14 @@ def test_cross_singleend():
 
 
 @pytest.mark.pairedend
-def test_cross_pairedend():
-    assert os.path.exists(os.path.join(test_output_path, 'ENCFF582IOZ_val_2ENCFF957SQS_val_1.filt.nodup.tagAlign.15.tagAlign.gz.cc.plot.pdf'))
-    qc_file = os.path.join(test_output_path,"ENCFF582IOZ_val_2ENCFF957SQS_val_1.filt.nodup.tagAlign.15.tagAlign.gz.cc.qc")
+def test_cross_plot_pairedend():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX.cc.plot.pdf'))
+
+
+@pytest.mark.pairedend
+def test_cross_qc_pairedend():
+    qc_file = os.path.join(test_output_path, "ENCLB568IYX.cc.qc")
     df_xcor = pd.read_csv(qc_file, sep="\t", header=None)
-    assert df_xcor[2].iloc[0] == '210,220,475'
-    assert round(df_xcor[8].iloc[0],6) == 1.062032
-    assert df_xcor[9].iloc[0] == 3.737722
+    assert df_xcor[2].iloc[0] == '220,430,475'
+    assert round(df_xcor[8].iloc[0], 6) == 1.060018
+    assert df_xcor[9].iloc[0] == 4.099357