Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • BICF/Astrocyte/chipseq_analysis
  • s190984/chipseq_analysis
  • astrocyte/workflows/bicf/chipseq_analysis
  • s219741/chipseq-analysis-containerized
Show changes
Showing
with 334 additions and 146 deletions
...@@ -4,11 +4,28 @@ profiles { ...@@ -4,11 +4,28 @@ profiles {
} }
} }
trace {
enabled = true
file = 'pipeline_trace.txt'
fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
}
timeline {
enabled = true
file = 'timeline.html'
}
report {
enabled = true
file = 'report.html'
}
manifest { manifest {
name = 'chipseq_analysis' name = 'chipseq_analysis'
description = 'BICF ChIP-seq Analysis Workflow.' description = 'BICF ChIP-seq Analysis Workflow.'
homePage = 'https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis' homePage = 'https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis'
version = '1.0.6' version = '1.1.2'
mainScript = 'main.nf' mainScript = 'main.nf'
nextflowVersion = '>=0.31.0' nextflowVersion = '>=0.31.0'
} }
...@@ -6,40 +6,27 @@ ...@@ -6,40 +6,27 @@
#* -------------------------------------------------------------------------- #* --------------------------------------------------------------------------
#* #*
#Currently Human or Mouse
# Load libraries # Load libraries
library("ChIPseeker") library("ChIPseeker")
library(GenomicFeatures)
# Currently mouse or human
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
library("TxDb.Mmusculus.UCSC.mm10.knownGene")
library("TxDb.Hsapiens.UCSC.hg38.knownGene")
library("org.Hs.eg.db")
library("org.Mm.eg.db")
# Create parser object # Create parser object
args <- commandArgs(trailingOnly=TRUE) args <- commandArgs(trailingOnly=TRUE)
# Check input args # Check input args
if (length(args) != 2) { if (length(args) != 3) {
stop("Usage: annotate_peaks.R annotate_design.tsv genome_assembly", call.=FALSE) stop("Usage: annotate_peaks.R annotate_design.tsv gtf geneNames", call.=FALSE)
} }
design_file <- args[1] design_file <- args[1]
genome_assembly <- args[2] gtf <- args[2]
geneNames <- args[3]
# Load UCSC Known Genes # Load UCSC Known Genes
if(genome_assembly=='GRCh37') { txdb <- makeTxDbFromGFF(gtf)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene sym <- read.table(geneNames, header=T, sep='\t') [,4:5]
annodb <- 'org.Hs.eg.db'
} else if(genome_assembly=='GRCm38') {
txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
annodb <- 'org.Mm.eg.db'
} else if(genome_assembly=='GRCh38') {
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
annodb <- 'org.Hs.eg.db'
}
# Output version of ChIPseeker # Output version of ChIPseeker
chipseeker_version = packageVersion('ChIPseeker') chipseeker_version = packageVersion('ChIPseeker')
...@@ -54,18 +41,19 @@ names(files) <- design$Condition ...@@ -54,18 +41,19 @@ names(files) <- design$Condition
# Granges of files # Granges of files
peaks <- lapply(files, readPeakFile, as = "GRanges", header = FALSE) peaks <- lapply(files, readPeakFile, as = "GRanges", header = FALSE)
peakAnnoList <- lapply(peaks, annotatePeak, TxDb=txdb, annoDb=annodb, tssRegion=c(-3000, 3000), verbose=FALSE) peakAnnoList <- lapply(peaks, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE)
column_names <- c("chr", "start", "end", "width", "strand_1", "name", "score", "strand", "signalValue", column_names <- c("geneId","chr", "start", "end", "width", "strand_1", "name", "score", "strand", "signalValue",
"pValue", "qValue", "peak", "annotation", "geneChr", "geneStart", "geneEnd", "pValue", "qValue", "peak", "annotation", "geneChr", "geneStart", "geneEnd",
"geneLength" ,"geneStrand", "geneId", "transcriptId", "distanceToTSS", "geneLength" ,"geneStrand", "transcriptId", "distanceToTSS", "symbol")
"ENSEMBL", "symbol", "geneName")
for(index in c(1:length(peakAnnoList))) { for(index in c(1:length(peakAnnoList))) {
filename <- paste(names(peaks)[index], ".chipseeker_annotation.tsv", sep="") filename <- paste(names(peaks)[index], ".chipseeker_annotation.tsv", sep="")
df <- as.data.frame(peakAnnoList[[index]]) df <- as.data.frame(peakAnnoList[[index]])
colnames(df) <- column_names df$geneId <- sapply(strsplit(as.character(df$geneId), split = "\\."), "[[", 1)
write.table(df[ , !(names(df) %in% c('strand_1'))], filename, sep="\t" ,quote=F, row.names=F) df_final <- merge(df, sym, by.x="geneId", by.y="ensembl", all.x=T)
colnames(df_final) <- column_names
write.table(df_final[ , !(names(df_final) %in% c('strand_1'))], filename, sep="\t" ,quote=F, row.names=F)
# Draw individual plots # Draw individual plots
......
...@@ -138,8 +138,20 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes) ...@@ -138,8 +138,20 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
with open(xcor, 'r') as xcor_fh: with open(xcor, 'r') as xcor_fh:
firstline = xcor_fh.readline() firstline = xcor_fh.readline()
frag_lengths = firstline.split()[2] # third column frag_lengths = firstline.split()[2] # third column
fragment_length = frag_lengths.split(',')[0] # grab first value frag_lengths_array = frag_lengths.split(',')
logger.info("Fraglen %s", fragment_length) fragment_length = 0
fragment = False
# Loop through all values of fragment length
for f in frag_lengths.split(','):
fragment_length = f
logger.info("Fraglen %s", fragment_length)
if int(fragment_length) > 50:
fragment = True
break
if fragment == False:
logger.info('Error in cross-correlation analysis: %s', frag_lengths_array)
raise Exception("Error in cross-correlation analysis: %s" % frag_lengths_array)
# Generate narrow peaks and preliminary signal tracks # Generate narrow peaks and preliminary signal tracks
......
...@@ -46,7 +46,7 @@ SOFTWARE_REGEX = { ...@@ -46,7 +46,7 @@ SOFTWARE_REGEX = {
'MEME-ChIP': ['motifSearch_vf/version_memechip.txt', r"Version (\S+)"], 'MEME-ChIP': ['motifSearch_vf/version_memechip.txt', r"Version (\S+)"],
'DiffBind': ['diffPeaks_vf/version_DiffBind.txt', r"Version (\S+)\""], 'DiffBind': ['diffPeaks_vf/version_DiffBind.txt', r"Version (\S+)\""],
'deepTools': ['experimentQC_vf/version_deeptools.txt', r"deeptools (\S+)"], 'deepTools': ['experimentQC_vf/version_deeptools.txt', r"deeptools (\S+)"],
'Python': ['version_python.txt', r"python, version (\S+)"], 'Python': ['version_python.txt', r"Python (\S+)"],
'MultiQC': ['version_multiqc.txt', r"multiqc, version (\S+)"], 'MultiQC': ['version_multiqc.txt', r"multiqc, version (\S+)"],
} }
......
#!/bin/bash
#plotProfile.sh
bws=`ls *.bw`
gtf=`ls *.gtf *.bed`
computeMatrix reference-point \
--referencePoint TSS \
-S $bws \
-R $gtf \
--skipZeros \
-o computeMatrix.gz
plotProfile -m computeMatrix.gz \
-out plotProfile.png \
#!/bin/bash
#plot_profile.sh
script_name="plot_profile.sh"
#Help function
usage() {
echo "-h --Help documentation for $script_name"
echo "-g --File path to gtf/bed files"
echo "Example: $script_name -g 'genome.gtf'"
exit 1
}
raise()
{
echo "${1}" >&2
}
check_tools() {
raise "
Checking for required libraries and components on this system
"
deeptools --version &> version_deeptools.txt
if [ $? -gt 0 ]
then
raise "Missing deeptools"
return 1
fi
}
compute_matrix() {
raise "
Computing matrix on ${1} using ${2}
"
computeMatrix reference-point \
--referencePoint TSS \
-S ${1} \
-R ${2} \
--skipZeros \
-o computeMatrix.gz \
-p max/2
if [ $? -gt 0 ]
then
raise "Problem building matrix"
return 1
fi
}
plot_profile() {
raise "
Plotting profile
"
plotProfile -m computeMatrix.gz \
-out plotProfile.png
if [ $? -gt 0 ]
then
raise "Problem plotting"
return 1
fi
}
run_main() {
# Parsing options
OPTIND=1 # Reset OPTIND
while getopts :g:h opt
do
case $opt in
g) gtf=$OPTARG;;
h) usage;;
esac
done
shift $(($OPTIND -1))
# Check for mandatory options
if [[ -z $gtf ]]; then
usage
fi
bws=$(ls *pooled.fc_signal.bw)
check_tools || exit 1
compute_matrix "${bws}" "${gtf}" || return 1
plot_profile || return 1
raise "ALL COMPLETE"
}
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]
then
run_main "$@"
if [ $? -gt 0 ]
then
exit 1
fi
fi
...@@ -204,83 +204,43 @@ def generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_con ...@@ -204,83 +204,43 @@ def generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_con
pool_control_tmp = bedpe_to_tagalign(pool_control, "pool_control") pool_control_tmp = bedpe_to_tagalign(pool_control, "pool_control")
pool_control = pool_control_tmp pool_control = pool_control_tmp
if not replicated:
# Duplicate rows and update for pool and psuedoreplicates and update tagAlign with single end data # Duplicate rows and update for pool and psuedoreplicates and update tagAlign with single end data
experiment_id = design_df.at[0, 'experiment_id'] experiment_id = design_df.at[0, 'experiment_id']
replicate = design_df.at[0, 'replicate'] replicate_files = design_df.tag_align.unique()
design_new_df = design_df.loc[np.repeat(design_df.index, 4)].reset_index() pool_experiment = pool(replicate_files, experiment_id + "_pooled", paired)
# Update tagAlign with single end data # Make 2 self psuedoreplicates
if paired: pseudoreplicates_dict = {}
design_new_df['tag_align'] = design_new_df['se_tag_align'] for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']):
design_new_df.drop(labels='se_tag_align', axis=1, inplace=True) replicate_prefix = experiment_id + '_' + str(rep)
pr_dict = self_psuedoreplication(tag_file, replicate_prefix, paired)
design_new_df['replicate'] = design_new_df['replicate'].astype(str) pseudoreplicates_dict[rep] = pr_dict
design_new_df.at[1, 'sample_id'] = experiment_id + '_pr1'
design_new_df.at[1, 'replicate'] = '1_pr' # Update design to include new self pseudo replicates
design_new_df.at[1, 'xcor'] = 'Calculate' pseudoreplicates_df = pd.DataFrame.from_dict(pseudoreplicates_dict)
design_new_df.at[2, 'sample_id'] = experiment_id + '_pr2' pool_pseudoreplicates_dict = {}
design_new_df.at[2, 'replicate'] = '2_pr' for index, row in pseudoreplicates_df.iterrows():
design_new_df.at[2, 'xcor'] = 'Calculate' replicate_id = index + 1
design_new_df.at[3, 'sample_id'] = experiment_id + '_pooled' pr_filename = experiment_id + ".pr" + str(replicate_id) + '.tagAlign.gz'
design_new_df.at[3, 'replicate'] = 'pooled' pool_replicate = pool(row, pr_filename, False)
design_new_df.at[3, 'xcor'] = 'Calculate' pool_pseudoreplicates_dict[replicate_id] = pool_replicate
design_new_df.at[3, 'tag_align'] = design_new_df.at[0, 'tag_align']
design_new_df = design_df #.loc[np.repeat(design_df.index, 4)].reset_index()
# Make 2 self psuedoreplicates # Update tagAlign with single end data
self_pseudoreplicates_dict = {} if paired:
for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']): design_new_df['tag_align'] = design_new_df['se_tag_align']
replicate_prefix = experiment_id + '_' + str(rep) design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
self_pseudoreplicates_dict = \
self_psuedoreplication(tag_file, replicate_prefix, paired)
# Update design to include new self pseudo replicates
for rep, pseudorep_file in self_pseudoreplicates_dict.items():
path_to_file = cwd + '/' + pseudorep_file
replicate = rep + 1
design_new_df.loc[replicate, 'tag_align'] = path_to_file
# Drop index column
design_new_df.drop(labels='index', axis=1, inplace=True)
# If paired change to single End
if paired:
pool_experiment_se = bedpe_to_tagalign(pool_experiment, experiment_id + "_pooled")
else: else:
# Make pool of replicates pool_experiment_se = pool_experiment
replicate_files = design_df.tag_align.unique()
experiment_id = design_df.at[0, 'experiment_id']
pool_experiment = pool(replicate_files, experiment_id + "_pooled", paired)
# If paired change to single End
if paired:
pool_experiment_se = bedpe_to_tagalign(pool_experiment, experiment_id + "_pooled")
else:
pool_experiment_se = pool_experiment
# Make self psuedoreplicates equivalent to number of replicates
pseudoreplicates_dict = {}
for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']):
replicate_prefix = experiment_id + '_' + str(rep)
pr_dict = self_psuedoreplication(tag_file, replicate_prefix, paired)
pseudoreplicates_dict[rep] = pr_dict
# Merge self psuedoreplication for each true replicate
pseudoreplicates_df = pd.DataFrame.from_dict(pseudoreplicates_dict)
pool_pseudoreplicates_dict = {}
for index, row in pseudoreplicates_df.iterrows():
replicate_id = index + 1
pr_filename = experiment_id + ".pr" + str(replicate_id) + '.tagAlign.gz'
pool_replicate = pool(row, pr_filename, False)
pool_pseudoreplicates_dict[replicate_id] = pool_replicate
design_new_df = design_df
# Update tagAlign with single end data
if paired:
design_new_df['tag_align'] = design_new_df['se_tag_align']
design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
# Check controls against cutoff_ratio
# if so replace with pool_control
# unless single control was used
# Check controls against cutoff_ratio
# if so replace with pool_control
# unless single control was used
if not single_control: if not single_control:
path_to_pool_control = cwd + '/' + pool_control path_to_pool_control = cwd + '/' + pool_control
if control_df.values.max() > cutoff_ratio: if control_df.values.max() > cutoff_ratio:
...@@ -308,7 +268,10 @@ def generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_con ...@@ -308,7 +268,10 @@ def generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_con
path_to_control path_to_control
else: else:
path_to_pool_control = cwd + '/' + pool_control if paired:
path_to_pool_control = cwd + '/' + pool_control
else:
path_to_pool_control = pool_control
design_new_df['control_tag_align'] = path_to_pool_control design_new_df['control_tag_align'] = path_to_pool_control
# Add in pseudo replicates # Add in pseudo replicates
...@@ -347,7 +310,7 @@ def main(): ...@@ -347,7 +310,7 @@ def main():
design_df = pd.read_csv(design, sep='\t') design_df = pd.read_csv(design, sep='\t')
# Get current directory to build paths # Get current directory to build paths
cwd = os.getcwd() cwd = os.getcwd()
# Check Number of replicates and replicates # Check Number of replicates and replicates
no_reps = check_replicates(design_df) no_reps = check_replicates(design_df)
...@@ -357,6 +320,7 @@ def main(): ...@@ -357,6 +320,7 @@ def main():
design_new_df = generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_controls) design_new_df = generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_controls)
# Write out new dataframe # Write out new dataframe
experiment_id = design_df.at[0, 'experiment_id']
design_new_df.to_csv(experiment_id + '_ppr.tsv', design_new_df.to_csv(experiment_id + '_ppr.tsv',
header=True, sep='\t', index=False) header=True, sep='\t', index=False)
......
...@@ -103,14 +103,20 @@ def xcor(tag, paired): ...@@ -103,14 +103,20 @@ def xcor(tag, paired):
uncompressed_tag_filename = tag_basename uncompressed_tag_filename = tag_basename
# Subsample tagAlign file # Subsample tagAlign file
number_reads = 15000000 number_reads = 20000000
subsampled_tag_filename = \ subsampled_tag_filename = \
tag_basename + ".%d.tagAlign.gz" % (number_reads/1000000) tag_basename + ".%d.tagAlign.gz" % (number_reads/1000000)
tag_extended = 'cat.tagAlign.gz'
out, err = utils.run_pipe([
"zcat %s %s %s" %
(tag, tag, tag)
], outfile=tag_extended)
steps = [ steps = [
'zcat %s' % (tag), 'zcat %s' % (tag),
'grep -v "chrM"', 'grep -v "chrM"',
'shuf -n %d --random-source=%s' % (number_reads, tag)] 'shuf -n %d --random-source=%s' % (number_reads, tag_extended)]
if paired: if paired:
steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""]) steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
......
#!/opt/bats/libexec/bats-core/ bats
profile_script="./workflow/scripts/plot_profile.sh"
@test "Test deeptools present" {
source ${profile_script}
run check_tools
}
@test "Test deeptools computeMatrix" {
source ${profile_script}
run compute_matrix test_data/ENCSR238SGC_pooled.fc_signal.bw /project/shared/bicf_workflow_ref/mouse/GRCm38/gencode.vM20.annotation.gtf
FILE=computeMatrix.gz
if [[ -s "$FILE" ]]; then
echo "$FILE exists and not empty"
fi
}
@test "Test deeptools plotProfile" {
source ${profile_script}
run plot_profile computeMatrix.gz
FILE=plotProfile.png
if [[ -s "$FILE" ]]; then
echo "$FILE exists and not empty"
fi
}
...@@ -25,6 +25,10 @@ def test_annotation_singleend(): ...@@ -25,6 +25,10 @@ def test_annotation_singleend():
annotation_file = test_output_path + 'ENCSR238SGC.chipseeker_annotation.tsv' annotation_file = test_output_path + 'ENCSR238SGC.chipseeker_annotation.tsv'
assert os.path.exists(annotation_file) assert os.path.exists(annotation_file)
assert utils.count_lines(annotation_file) >= 149284 assert utils.count_lines(annotation_file) >= 149284
df = pd.read_csv(annotation_file, sep = "\t", header = 0)
print(df.head())
#assert df['symbol'].notna().all()
assert not(df['symbol'].isnull().values.any())
@pytest.mark.pairedend @pytest.mark.pairedend
...@@ -41,4 +45,8 @@ def test_upsetplot_pairedend(): ...@@ -41,4 +45,8 @@ def test_upsetplot_pairedend():
def test_annotation_pairedend(): def test_annotation_pairedend():
annotation_file = test_output_path + 'ENCSR729LGA.chipseeker_annotation.tsv' annotation_file = test_output_path + 'ENCSR729LGA.chipseeker_annotation.tsv'
assert os.path.exists(annotation_file) assert os.path.exists(annotation_file)
assert utils.count_lines(annotation_file) >= 25494 assert utils.count_lines(annotation_file) >= 25367
df = pd.read_csv(annotation_file, sep = "\t", header = 0)
print(df.head())
#assert df['symbol'].notna().all()
assert not(df['symbol'].isnull().values.any())
...@@ -3,10 +3,29 @@ ...@@ -3,10 +3,29 @@
import pytest import pytest
import os import os
import utils import utils
from io import StringIO
import call_peaks_macs
test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/callPeaksMACS/' '/../output/callPeaksMACS/'
test_data_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../../test_data/'
@pytest.mark.unit
def test_fragment_length():
experiment = "experiment.tagAlign.gz"
control = "control.tagAlign.gz"
prefix = 'test'
genome_size = 'hs'
chrom_sizes = 'genomefile.txt'
cross_qc = os.path.join(test_data_path, 'test_cross.qc')
with pytest.raises(Exception) as excinfo:
call_peaks_macs.call_peaks_macs(experiment, cross_qc, control, prefix, genome_size, chrom_sizes)
assert str(excinfo.value) == "Error in cross-correlation analysis: ['0', '20', '33']"
@pytest.mark.singleend @pytest.mark.singleend
def test_fc_signal_singleend(): def test_fc_signal_singleend():
...@@ -26,7 +45,7 @@ def test_peaks_xls_singleend(): ...@@ -26,7 +45,7 @@ def test_peaks_xls_singleend():
@pytest.mark.singleend @pytest.mark.singleend
def test_peaks_bed_singleend(): def test_peaks_bed_singleend():
peak_file = test_output_path + 'ENCSR238SGC/1/' + 'ENCLB144FDT.narrowPeak' peak_file = test_output_path + 'ENCSR238SGC/1/' + 'ENCLB144FDT.narrowPeak'
assert utils.count_lines(peak_file) == 227389 assert utils.count_lines(peak_file) == 226738
@pytest.mark.pairedend @pytest.mark.pairedend
...@@ -47,4 +66,4 @@ def test_peaks_xls_pairedend(): ...@@ -47,4 +66,4 @@ def test_peaks_xls_pairedend():
@pytest.mark.pairedend @pytest.mark.pairedend
def test_peaks_bed_pairedend(): def test_peaks_bed_pairedend():
peak_file = test_output_path + 'ENCSR729LGA/2/' + 'ENCLB568IYX.narrowPeak' peak_file = test_output_path + 'ENCSR729LGA/2/' + 'ENCLB568IYX.narrowPeak'
assert utils.count_lines(peak_file) == 113821 assert utils.count_lines(peak_file) == 112631
...@@ -15,7 +15,7 @@ def test_diff_peaks_singleend_single_rep(): ...@@ -15,7 +15,7 @@ def test_diff_peaks_singleend_single_rep():
assert os.path.isdir(test_output_path) == False assert os.path.isdir(test_output_path) == False
@pytest.mark.pairedend @pytest.mark.pairedendskip_true
def test_diff_peaks_pairedend_single_rep(): def test_diff_peaks_pairedend_single_rep():
assert os.path.isdir(test_output_path) == False assert os.path.isdir(test_output_path) == False
...@@ -71,4 +71,4 @@ def test_diffbind_pairedend_single_rep(): ...@@ -71,4 +71,4 @@ def test_diffbind_pairedend_single_rep():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA_vs_ENCSR757EMK_diffbind.bed')) assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA_vs_ENCSR757EMK_diffbind.bed'))
diffbind_file = test_output_path + 'ENCSR729LGA_vs_ENCSR757EMK_diffbind.csv' diffbind_file = test_output_path + 'ENCSR729LGA_vs_ENCSR757EMK_diffbind.csv'
assert os.path.exists(diffbind_file) assert os.path.exists(diffbind_file)
assert utils.count_lines(diffbind_file) >= 66201 assert utils.count_lines(diffbind_file) >= 65124
...@@ -4,6 +4,7 @@ import pytest ...@@ -4,6 +4,7 @@ import pytest
import os import os
import utils import utils
import yaml import yaml
from bs4 import BeautifulSoup
test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/multiqcReport/' '/../output/multiqcReport/'
...@@ -21,3 +22,14 @@ def test_software_references_output(): ...@@ -21,3 +22,14 @@ def test_software_references_output():
data_loaded = yaml.load(stream) data_loaded = yaml.load(stream)
assert len(data_loaded['data'].split('<ul>')) == 19 assert len(data_loaded['data'].split('<ul>')) == 19
@pytest.mark.singleend
def test_software_references_html():
multiqc_report = os.path.join(test_output_path, 'multiqc_report.html')
html_file = open(multiqc_report, 'r')
source_code = html_file.read()
multiqc_html = BeautifulSoup(source_code, 'html.parser')
references = multiqc_html.find(id="mqc-module-section-Software_References")
assert references is not None
assert len(references.find_all('ul')) == 18
...@@ -20,4 +20,5 @@ def test_software_versions_output(): ...@@ -20,4 +20,5 @@ def test_software_versions_output():
with open(software_versions, 'r') as stream: with open(software_versions, 'r') as stream:
data_loaded = yaml.load(stream) data_loaded = yaml.load(stream)
assert len(data_loaded['data'].split('<dt>')) == 17 assert len(data_loaded['data'].split('<dt>')) == 18
assert 'Not Run' not in data_loaded['data'].split('<dt>')[17]
...@@ -44,4 +44,10 @@ def test_overlap_peaks_singleend(): ...@@ -44,4 +44,10 @@ def test_overlap_peaks_singleend():
def test_overlap_peaks_pairedend(): def test_overlap_peaks_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA.rejected.narrowPeak')) assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA.rejected.narrowPeak'))
peak_file = test_output_path + 'ENCSR729LGA.replicated.narrowPeak' peak_file = test_output_path + 'ENCSR729LGA.replicated.narrowPeak'
assert utils.count_lines(peak_file) >= 25758 assert utils.count_lines(peak_file) >= 25657
@pytest.mark.singlecontrol
def test_overlap_peaks_singlecontrol():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR000DXB.rejected.narrowPeak'))
peak_file = test_output_path + 'ENCSR000DXB.replicated.narrowPeak'
assert utils.count_lines(peak_file) >= 35097
#!/usr/bin/env python3
import pytest
import os
import utils
test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/experimentQC/'
@pytest.mark.singleend
def test_plot_singleend():
assert os.path.exists(os.path.join(test_output_path, 'plotProfile.png'))
assert os.path.getsize(os.path.join(test_output_path, 'plotProfile.png')) > 0
@pytest.mark.pairedend
def test_plot_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'plotProfile.png'))
assert os.path.getsize(os.path.join(test_output_path, 'plotProfile.png')) > 0
...@@ -5,16 +5,18 @@ import pandas as pd ...@@ -5,16 +5,18 @@ import pandas as pd
from io import StringIO from io import StringIO
import os import os
import pool_and_psuedoreplicate import pool_and_psuedoreplicate
import shutil
test_design_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../../test_data/'
test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/design/' '/../output/design/'
DESIGN_STRING = """sample_id\ttag_align\txcor\tbiosample\tfactor\ttreatment\treplicate\tcontrol_tag_align DESIGN_STRING = """sample_id\tse_tag_align\ttag_align\txcor\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\tcontrol_tag_align
A_1\tA_1.bedse.gz\tA_1.cc.qc\tLiver\tH3K27ac\tNone\t1\tB_1.bedse.gz A_1\tA_1.tagAlign.gz\tA_1.bedse.gz\tA_1.cc.qc\tA\tLiver\tH3K27ac\tNone\t1\tB_1\tB_1.bedse.gz
A_2\tA_2.bedse.gz\tA_2.cc.qc\tLiver\tH3K27ac\tNone\t2\tB_2.bedse.gz A_2\tA_2.tagAlign.gz\tA_2.bedse.gz\tA_2.cc.qc\tA\tLiver\tH3K27ac\tNone\t2\tB_2\tB_2.bedse.gz
""" """
@pytest.fixture @pytest.fixture
def design_experiment(): def design_experiment():
design_file = StringIO(DESIGN_STRING) design_file = StringIO(DESIGN_STRING)
...@@ -31,9 +33,12 @@ def design_experiment_2(design_experiment): ...@@ -31,9 +33,12 @@ def design_experiment_2(design_experiment):
@pytest.fixture @pytest.fixture
def design_experiment_3(design_experiment): def design_experiment_3(design_experiment):
# Update second control to be same as first # Drop Replicate A_2
design_experiment.loc[1, 'control_tag_align'] = 'B_1.bedse.gz' design_df = design_experiment.drop(design_experiment.index[1])
return design_experiment # Update to be paired as first
design_df.loc[0, 'control_tag_align'] = 'B_1.bedpe.gz'
design_df.loc[0, 'tag_align'] = 'A_1.bedpe.gz'
return design_df
@pytest.mark.unit @pytest.mark.unit
...@@ -63,9 +68,24 @@ def test_check_controls_single(design_experiment_3): ...@@ -63,9 +68,24 @@ def test_check_controls_single(design_experiment_3):
@pytest.mark.unit @pytest.mark.unit
def test_single_rep(design_experiment_2): def test_single_rep(design_experiment_2):
cwd = os.getcwd() cwd = os.getcwd()
B_1 = open(cwd + 'B_1.bedse.gz') shutil.copy(test_design_path + 'A_1.bedse.gz', cwd)
B_1.close() shutil.copy(test_design_path + 'B_1.bedse.gz', cwd)
shutil.copy(test_design_path + 'A_1.tagAlign.gz', cwd)
shutil.copy(test_design_path + 'B_1.tagAlign.gz', cwd)
single_rep = pool_and_psuedoreplicate.generate_design('false', 1.2, design_experiment_2, cwd, 1, 1) single_rep = pool_and_psuedoreplicate.generate_design('false', 1.2, design_experiment_2, cwd, 1, 1)
assert single_rep.shape[0] == 4
assert len(single_rep['control_tag_align'].unique()) == 2
assert 'pool_control.tagAlign.gz' in single_rep['control_tag_align'].unique()[1]
@pytest.mark.unit
def test_single_control(design_experiment_3):
cwd = os.getcwd()
shutil.copy(test_design_path + 'A_1.bedpe.gz', cwd)
shutil.copy(test_design_path + 'B_1.bedpe.gz', cwd)
shutil.copy(test_design_path + 'A_1.tagAlign.gz', cwd)
single_control = pool_and_psuedoreplicate.generate_design('true', 1.2, design_experiment_3, cwd, 1, 1)
assert 'pool_control.tagAlign.gz' in single_control['control_tag_align'].unique()[0]
@pytest.mark.singleend @pytest.mark.singleend
......
...@@ -17,9 +17,9 @@ def test_cross_plot_singleend(): ...@@ -17,9 +17,9 @@ def test_cross_plot_singleend():
def test_cross_qc_singleend(): def test_cross_qc_singleend():
qc_file = os.path.join(test_output_path,"ENCLB144FDT/ENCLB144FDT.cc.qc") qc_file = os.path.join(test_output_path,"ENCLB144FDT/ENCLB144FDT.cc.qc")
df_xcor = pd.read_csv(qc_file, sep="\t", header=None) df_xcor = pd.read_csv(qc_file, sep="\t", header=None)
assert df_xcor[2].iloc[0] == '190,200,210' assert df_xcor[2].iloc[0] == '185,195,205'
assert df_xcor[8].iloc[0] == 1.025906 assert df_xcor[8].iloc[0] == 1.02454
assert round(df_xcor[9].iloc[0], 6) == 1.139671 assert df_xcor[9].iloc[0] == 0.8098014
@pytest.mark.pairedend @pytest.mark.pairedend
...@@ -31,6 +31,6 @@ def test_cross_qc_pairedend(): ...@@ -31,6 +31,6 @@ def test_cross_qc_pairedend():
def test_cross_plot_pairedend(): def test_cross_plot_pairedend():
qc_file = os.path.join(test_output_path,"ENCLB568IYX/ENCLB568IYX.cc.qc") qc_file = os.path.join(test_output_path,"ENCLB568IYX/ENCLB568IYX.cc.qc")
df_xcor = pd.read_csv(qc_file, sep="\t", header=None) df_xcor = pd.read_csv(qc_file, sep="\t", header=None)
assert df_xcor[2].iloc[0] == '220,430,475' assert df_xcor[2].iloc[0] == '215,225,455'
assert round(df_xcor[8].iloc[0],6) == 1.060018 assert round(df_xcor[8].iloc[0],6) == 1.056201
assert df_xcor[9].iloc[0] == 4.099357 assert df_xcor[9].iloc[0] == 3.599357