diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 79370bd0eda7a5926ba2c01514fbceb9941f944d..a54c5d7d11d173b292452f3906efd258c4b0582b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,30 +1,48 @@ before_script: - module add python/3.6.1-2-anaconda - pip install --user pytest-pythonpath pytest-cov - - module load nextflow/0.27.6 - - ln -s /project/shared/bicf_workflow_ref/workflow_testdata/chipseq/*fastq.gz test_data/ + - module load nextflow/0.31.0 + - ln -s /work/BICF/s163035/chipseq/*fastq.gz test_data/ stages: - unit - - integration + - single + - multiple user_configuration: stage: unit script: - pytest -m unit + - pytest -m unit --cov=./workflow/scripts single_end_mouse: - stage: integration + stage: single script: - - nextflow run workflow/main.nf - - pytest -m integration - - pytest --cov=./workflow/scripts + - nextflow run workflow/main.nf -resume + - pytest -m singleend artifacts: expire_in: 2 days paired_end_human: - stage: integration + stage: single script: - - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_ENCSR729LGA_PE.txt" --genome 'GRCh38' --pairedEnd true + - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_ENCSR729LGA_PE.txt" --genome 'GRCh38' --pairedEnd true -resume + - pytest -m pairedend + artifacts: + expire_in: 2 days + +single_end_diff: + stage: multiple + script: + - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' -resume + - pytest -m singlediff + artifacts: + expire_in: 2 days + +paired_end_diff: + stage: multiple + script: + - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_PE.txt" --genome 'GRCh38' --pairedEnd true -resume + - pytest -m paireddiff artifacts: expire_in: 2 days diff --git a/astrocyte_pkg.yml b/astrocyte_pkg.yml index 7db5da7e1e95561cd005a3b725ddc8633fb76e13..929e2316a228ee29b6cd6f8f23627d382b9de224 100644 --- a/astrocyte_pkg.yml +++ b/astrocyte_pkg.yml @@ -41,16 +41,16 @@ documentation_files: workflow_modules: - 'python/3.6.1-2-anaconda' - 'trimgalore/0.4.1' - - 'cutadapt/1.9.1' - - 'fastqc/0.11.2' - 'bwa/intel/0.7.12' - - 'samtools/1.4.1' - 'sambamba/0.6.6' - 'bedtools/2.26.0' - 'deeptools/2.5.0.1' - 'phantompeakqualtools/1.2' - 'macs/2.1.0-20151222' - 'UCSC_userApps/v317' + - 'R/3.4.1-gccmkl' + - 'meme/4.11.1-gcc-openmpi' + - 'python/2.7.x-anaconda' # A list of parameters used by the workflow, defining how to present them, # options etc in the web interface. For each parameter: @@ -113,13 +113,14 @@ workflow_parameters: description: | A design file listing sample id, fastq files, corresponding control id and additional information about the sample. + regex: ".*tsv" - id: genome type: select choices: - [ 'GRCh38', 'Human GRCh38'] - [ 'GRCh37', 'Human GRCh37'] - - [ 'GRCh38', 'Mouse GRCh38'] + - [ 'GRCm38', 'Mouse GRCm38'] required: true description: | Reference species and genome used for alignment and subsequent analysis. @@ -133,7 +134,7 @@ workflow_parameters: # The workflow must publish all final output into $baseDir # Name of the R module that the vizapp will run against -vizapp_r_module: 'R/3.2.1-intel' +vizapp_r_module: 'R/3.4.1-gccmkl' # List of any CRAN packages, not provided by the modules, that must be made # available to the vizapp diff --git a/docs/index.md b/docs/index.md index 5bfa0db0ba46fe8c6c191989a86c5dbe8c566d38..b6c66cec174ba4aeca2216c39f18dda3bfa89010 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,35 +1,34 @@ # Astrocyte ChIPseq analysis Workflow Package -This SOP describes the analysis pipeline of downstream analysis of ChIP-seq sequencing data. This pipeline includes (1) Quality control using Deeptools, (2) Peak annotation, (3) Differential peak analysis, and (4) motif analysis. BAM files and SORTED peak BED files selected as input. For each sample this workflow: +## Introduction +**ChIP-seq Analysis** is a bioinformatics best-practice analysis pipeline used for chromatin immunoprecipitation (ChIP-seq) data analysis. - 1) Annotate all peaks using ChipSeeker - 2) Qulity control and signal profiling with Deeptools - 3) Find differential expressed peaks using DiffBind - 4) Annotate all differentially expressed peaks - 5) Using MEME-ChIP in motif finding for both original peaks and differently expressed peaks +The pipeline uses [Nextflow](https://www.nextflow.io), a bioinformatics workflow tool. It pre-processes raw data from FastQ inputs, aligns the reads and performs extensive quality-control on the results. +### Pipeline Steps +1) Trim adaptors TrimGalore! +2) Align with BWA +3) Filter reads with Sambamba S +4) Quality control with DeepTools +5) Calculate Cross-correlation using SPP and PhantomPeakQualTools +6) Signal profiling using MACS2 +7) Call consenus peaks +8) Annotate all peaks using ChipSeeker +9) Use MEME-ChIP to find motifs in original peaks +10) Find differential expressed peaks using DiffBind (If more than 1 experiment) -## Annotations used in the pipeline - - ChipSeeker - Known gene from Bioconductor [TxDb annotation](https://bioconductor.org/packages/release/BiocViews.html#___TxDb) - Deeptools - RefGene downloaded from UCSC Table browser - - - ## Workflow Parameters - bam - Choose all ChIP-seq alignment files for analysis. + reads - Choose all ChIP-seq fastq files for analysis. + pairedEnd - Choose True/False if data is paired-end + design - Choose the file with the experiment design information. TSV format genome - Choose a genomic reference (genome). - peaks - Choose all the peak files for analysis. All peaks should be sorted by the user - design - Choose the file with the experiment design information. CSV format - toppeak - The number of top peaks used for motif analysis. Default is all - ## Design file - + The following columns are necessary, must be named as in template. An design file template can be downloaded [HERE](https://git.biohpc.swmed.edu/bchen4/chipseq_analysis/raw/master/docs/design_example.csv) SampleID @@ -52,7 +51,7 @@ This SOP describes the analysis pipeline of downstream analysis of ChIP-seq sequ The id of the control sample PeakCaller The peak caller used - + ### Credits @@ -64,5 +63,3 @@ This example worklow is derived from original scripts kindly contributed by the * DiffBind: http://bioconductor.org/packages/release/bioc/html/DiffBind.html * Deeptools: https://deeptools.github.io/ * MEME-ChIP: http://meme-suite.org/doc/meme-chip.html - - diff --git a/test_data/design_diff_PE.txt b/test_data/design_diff_PE.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0765823e6396311f55fc53b636e80f290c571e3 --- /dev/null +++ b/test_data/design_diff_PE.txt @@ -0,0 +1,7 @@ +sample_id experiment_id biosample factor treatment replicate control_id fastq_read1 fastq_read2 +ENCLB637LZP ENCSR729LGA MCF-7 SP1 None 1 ENCLB678IDC ENCFF957SQS.fastq.gz ENCFF582IOZ.fastq.gz +ENCLB568IYX ENCSR729LGA MCF-7 SP1 None 2 ENCLB336TVW ENCFF330MCZ.fastq.gz ENCFF293YFE.fastq.gz +ENCLB678IDC ENCSR217LRF MCF-7 Control None 1 ENCLB678IDC ENCFF002DTU.fastq.gz ENCFF002EFI.fastq.gz +ENCLB336TVW ENCSR217LRF MCF-7 Control None 2 ENCLB336TVW ENCFF002EFG.fastq.gz ENCFF002DTS.fastq.gz +ENCLB552ACZ ENCSR757EMK MCF-7 SUZ12 None 1 ENCLB678IDC ENCFF833EZX.fastq.gz ENCFF161HBP.fastq.gz +ENCLB872TQR ENCSR757EMK MCF-7 SUZ12 None 2 ENCLB336TVW ENCFF776KZU.fastq.gz ENCFF119KHM.fastq.gz diff --git a/test_data/design_diff_SE.txt b/test_data/design_diff_SE.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5aac58dcafbf113a640940d23fb34d0d1455573 --- /dev/null +++ b/test_data/design_diff_SE.txt @@ -0,0 +1,9 @@ +sample_id experiment_id biosample factor treatment replicate control_id fastq_read1 +ENCLB144FDT ENCSR238SGC limb H3K4me1 None 1 ENCLB304SBJ ENCFF833BLU.fastq.gz +ENCLB831RUI ENCSR238SGC limb H3K4me1 None 2 ENCLB410VVO ENCFF646LXU.fastq.gz +ENCLB304SBJ ENCSR687ALB limb Control None 1 ENCLB304SBJ ENCFF524CAC.fastq.gz +ENCLB410VVO ENCSR687ALB limb Control None 2 ENCLB410VVO ENCFF163AJI.fastq.gz +ENCLB140BPV ENCSR272GNQ midbrain H3K4me1 None 1 ENCLB841FLB ENCFF278VQW.fastq.gz +ENCLB785CNN ENCSR272GNQ midbrain H3K4me1 None 2 ENCLB735ZEL ENCFF466CFM.fastq.gz +ENCLB841FLB ENCSR842LMA midbrain Control None 1 ENCLB841FLB ENCFF914QXH.fastq.gz +ENCLB735ZEL ENCSR842LMA midbrain Control None 2 ENCLB735ZEL ENCFF942UQV.fastq.gz diff --git a/test_data/fetch_test_data.sh b/test_data/fetch_test_data.sh index a6107cdd49252429c01b54313fb64e3479ee5813..8b9a33125f8e55f4f96947961f402ed6f97897c9 100644 --- a/test_data/fetch_test_data.sh +++ b/test_data/fetch_test_data.sh @@ -3,6 +3,11 @@ wget https://www.encodeproject.org/files/ENCFF833BLU/@@download/ENCFF833BLU.fast wget https://www.encodeproject.org/files/ENCFF646LXU/@@download/ENCFF646LXU.fastq.gz wget https://www.encodeproject.org/files/ENCFF524CAC/@@download/ENCFF524CAC.fastq.gz wget https://www.encodeproject.org/files/ENCFF163AJI/@@download/ENCFF163AJI.fastq.gz +echo "Downloading Single-end data set Mouse ENCSR272GNQ and ENCSR842LMA" +wget https://www.encodeproject.org/files/ENCFF278VQW/@@download/ENCFF278VQW.fastq.gz +wget https://www.encodeproject.org/files/ENCFF466CFM/@@download/ENCFF466CFM.fastq.gz +wget https://www.encodeproject.org/files/ENCFF914QXH/@@download/ENCFF914QXH.fastq.gz +wget https://www.encodeproject.org/files/ENCFF942UQV/@@download/ENCFF942UQV.fastq.gz echo "Done with Single-end" echo "Downloading Paired-end data set Human ENCSR729LGA and ENCSR217LRF" @@ -14,4 +19,9 @@ wget https://www.encodeproject.org/files/ENCFF002DTU/@@download/ENCFF002DTU.fast wget https://www.encodeproject.org/files/ENCFF002EFI/@@download/ENCFF002EFI.fastq.gz wget https://www.encodeproject.org/files/ENCFF002EFG/@@download/ENCFF002EFG.fastq.gz wget https://www.encodeproject.org/files/ENCFF002DTS/@@download/ENCFF002DTS.fastq.gz +echo "Downloading Paired-end data set Human ENCSR757EMK" +wget https://www.encodeproject.org/files/ENCFF833EZX/@@download/ENCFF833EZX.fastq.gz +wget https://www.encodeproject.org/files/ENCFF161HBP/@@download/ENCFF161HBP.fastq.gz +wget https://www.encodeproject.org/files/ENCFF776KZU/@@download/ENCFF776KZU.fastq.gz +wget https://www.encodeproject.org/files/ENCFF119KHM/@@download/ENCFF119KHM.fastq.gz echo "Done with Paired-end" diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 30067fa3f0c9972a120e2f15d337334769eb0f40..2c0f1c31a9c46b5f9f45699f3c1eb506d6c62e10 100644 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -4,50 +4,63 @@ process { clusterOptions = '--hold' // Process specific configuration - $checkDesignFile { + withName: checkDesignFile { module = ['python/3.6.1-2-anaconda'] executor = 'local' } - $trimReads { + withName: trimReads { module = ['python/3.6.1-2-anaconda', 'trimgalore/0.4.1'] cpus = 32 } - $alignReads{ + withName: alignReads{ module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/1.6'] - cpus = 32 + queue = '128GB,256GB,256GBv1' } - $filterReads{ + withName: filterReads{ module = ['python/3.6.1-2-anaconda', 'samtools/1.6', 'sambamba/0.6.6', 'bedtools/2.26.0'] - cpus = 32 + queue = '128GB,256GB,256GBv1' } - $experimentQC { + withName: experimentQC { module = ['python/3.6.1-2-anaconda', 'deeptools/2.5.0.1'] - cpus = 32 + queue = '128GB,256GB,256GBv1' } - $convertReads { + withName: convertReads { module = ['python/3.6.1-2-anaconda', 'samtools/1.6', 'bedtools/2.26.0'] - cpus = 32 + queue = '128GB,256GB,256GBv1' } - $crossReads { + withName: crossReads { module = ['python/3.6.1-2-anaconda', 'phantompeakqualtools/1.2'] cpus = 32 } - $defineExpDesignFiles { + withName: defineExpDesignFiles { module = ['python/3.6.1-2-anaconda'] executor = 'local' } - $poolAndPsuedoReads { + withName: poolAndPsuedoReads { module = ['python/3.6.1-2-anaconda'] executor = 'local' } - $callPeaksMACS { + withName: callPeaksMACS { module = ['python/3.6.1-2-anaconda', 'macs/2.1.0-20151222', 'phantompeakqualtools/1.2', 'UCSC_userApps/v317', 'bedtools/2.26.0'] - cpus = 32 + queue = '128GB,256GB,256GBv1' } - $consensusPeaks { + withName: consensusPeaks { module = ['python/3.6.1-2-anaconda', 'bedtools/2.26.0'] executor = 'local' } + withName: peakAnnotation { + module = ['R/3.3.2-gccmkl'] + executor = 'local' + } + withName: diffPeaks { + module = ['R/3.3.2-gccmkl'] + cpus = 32 + } + withName: motifSearch { + module = ['python/3.6.1-2-anaconda', 'meme/4.11.1-gcc-openmpi', 'bedtools/2.26.0'] + cpus = 32 + } + } params { @@ -57,16 +70,19 @@ params { bwa = '/project/shared/bicf_workflow_ref/GRCh38' genomesize = 'hs' chromsizes = '/project/shared/bicf_workflow_ref/GRCh38/genomefile.txt' + fasta = '/project/shared/bicf_workflow_ref/GRCh38/genome.fa' } 'GRCh37' { bwa = '/project/shared/bicf_workflow_ref/GRCh37' genomesize = 'hs' chromsizes = '/project/shared/bicf_workflow_ref/GRCh37/genomefile.txt' + fasta = '/project/shared/bicf_workflow_ref/GRCh37/genome.fa' } 'GRCm38' { bwa = '/project/shared/bicf_workflow_ref/GRCm38' genomesize = 'mm' chromsizes = '/project/shared/bicf_workflow_ref/GRCm38/genomefile.txt' + fasta = '/project/shared/bicf_workflow_ref/GRCm38/genome.fa' } } } diff --git a/workflow/main.nf b/workflow/main.nf index 95a0aea9be9d3bae86b5d66a7ca96ce260f69ba5..29afb260c8dee81e663e4031fbf6625398eaa321 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -12,9 +12,11 @@ params.genomes = [] params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false +params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false params.cutoffRatio = 1.2 params.outDir= "$baseDir/output" params.extendReadsLen = 100 +params.topPeakCount = 600 // Check inputs if( params.bwaIndex ){ @@ -36,11 +38,15 @@ readsList = Channel pairedEnd = params.pairedEnd designFile = params.designFile genomeSize = params.genomeSize +genome = params.genome chromSizes = params.chromSizes +fasta = params.fasta cutoffRatio = params.cutoffRatio outDir = params.outDir extendReadsLen = params.extendReadsLen +topPeakCount = params.topPeakCount +// Check design file for errors process checkDesignFile { publishDir "$outDir/design", mode: 'copy' @@ -372,7 +378,9 @@ process consensusPeaks { file '*.replicated.*' into consensusPeaks file '*.rejected.*' into rejectedPeaks - file("design_diffPeaks.tsv") into designDiffPeaks + file 'design_diffPeaks.csv' into designDiffPeaks + file 'design_annotatePeaks.tsv' into designAnnotatePeaks, designMotifSearch + file 'unique_experiments.csv' into uniqueExperiments script: @@ -381,3 +389,76 @@ process consensusPeaks { """ } + +// Annotate Peaks +process peakAnnotation { + + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + file designAnnotatePeaks + + output: + + file "*chipseeker*" into peakAnnotation + + script: + + """ + Rscript $baseDir/scripts/annotate_peaks.R $designAnnotatePeaks $genome + """ + +} + +// Motif Search Peaks +process motifSearch { + + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + file designMotifSearch + + output: + + file "*memechip" into motifSearch + file "sorted-*" into filteredPeaks + + script: + + """ + python3 $baseDir/scripts/motif_search.py -d $designMotifSearch -g $fasta -p $topPeakCount + """ +} + +// Define channel to find number of unique experiments +uniqueExperimentsList = uniqueExperiments + .splitCsv(sep: '\t', header: true) + +// Calculate Differential Binding Activity +process diffPeaks { + + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + file designDiffPeaks + val noUniqueExperiments from uniqueExperimentsList.count() + + output: + + file '*_diffbind.bed' into diffPeaks + file '*_diffbind.csv' into diffPeaksCounts + file '*.pdf' into diffPeaksStats + file 'normcount_peaksets.txt' into normCountPeaks + + when: + noUniqueExperiments > 1 + + + script: + """ + Rscript $baseDir/scripts/diff_peaks.R $designDiffPeaks + """ +} diff --git a/workflow/scripts/annotate_peaks.R b/workflow/scripts/annotate_peaks.R new file mode 100644 index 0000000000000000000000000000000000000000..c15ba698dc3f9d120024426b4d7f57bd24a741a6 --- /dev/null +++ b/workflow/scripts/annotate_peaks.R @@ -0,0 +1,74 @@ +#!/bin/Rscript + +# Load libraries +library("ChIPseeker") + +# Currently mouse or human + +library("TxDb.Hsapiens.UCSC.hg19.knownGene") +library("TxDb.Mmusculus.UCSC.mm10.knownGene") +library("TxDb.Hsapiens.UCSC.hg38.knownGene") +library("org.Hs.eg.db") +library("org.Mm.eg.db") + + +# Create parser object +args <- commandArgs(trailingOnly=TRUE) + +# Check input args +if (length(args) != 2) { + stop("Usage: annotate_peaks.R [ annotate_design.tsv ] [ genome_assembly ]", call.=FALSE) +} + +design_file <- args[1] +genome_assembly <- args[2] + +# Load UCSC Known Genes +if(genome_assembly=='GRCh37') { + txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene + annodb <- 'org.Hs.eg.db' +} else if(genome_assembly=='GRCm38') { + txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene + annodb <- 'org.Mm.eg.db' +} else if(genome_assembly=='GRCh38') { + txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene + annodb <- 'org.Hs.eg.db' +} + +# Load design file +design <- read.csv(design_file, sep ='\t') +files <- as.list(as.character(design$Peaks)) +names(files) <- design$Condition + +# Granges of files + +peaks <- lapply(files, readPeakFile, as = "GRanges", header = FALSE) +peakAnnoList <- lapply(peaks, annotatePeak, TxDb=txdb, annoDb=annodb, tssRegion=c(-3000, 3000), verbose=FALSE) + +column_names <- c("chr", "start", "end", "width", "strand_1", "name", "score", "strand", "signalValue", + "pValue", "qValue", "peak", "annotation", "geneChr", "geneStart", "geneEnd", + "geneLength" ,"geneStrand", "geneId", "transcriptId", "distanceToTSS", + "ENSEMBL", "symbol", "geneName") + +for(index in c(1:length(peakAnnoList))) { + filename <- paste(names(peaks)[index], ".chipseeker_annotation.tsv", sep="") + df <- as.data.frame(peakAnnoList[[index]]) + colnames(df) <- column_names + write.table(df[ , !(names(df) %in% c('strand_1'))], filename, sep="\t" ,quote=F, row.names=F) + + # Draw individual plots + + # Define names of Plots + pie_name <- paste(names(files)[index],".chipseeker_pie.pdf",sep="") + upsetplot_name <- paste(names(files)[index],".chipseeker_upsetplot.pdf",sep="") + + # Pie Plots + pdf(pie_name) + plotAnnoPie(peakAnnoList[[index]]) + dev.off() + + # Upset Plot + pdf(upsetplot_name, onefile=F) + upsetplot(peakAnnoList[[index]]) + dev.off() +} diff --git a/workflow/scripts/runDiffBind.R b/workflow/scripts/diff_peaks.R similarity index 62% rename from workflow/scripts/runDiffBind.R rename to workflow/scripts/diff_peaks.R index f4c7a6bc6bcc47e56b6a72ffde2109c362297d21..76caa95495c7cae1d014530e78e1e1e3c4af09dc 100644 --- a/workflow/scripts/runDiffBind.R +++ b/workflow/scripts/diff_peaks.R @@ -1,45 +1,50 @@ +#!/bin/Rscript + +# Load libraries library("DiffBind") -#build dba object from sample sheet and do analysis -args <- commandArgs(TRUE) +# Create parser object +args <- commandArgs(trailingOnly=TRUE) + +# Check input args +if (length(args) != 1) { + stop("Usage: diff_peaks.R [ annotate_design.tsv ] ", call.=FALSE) +} + +# Build DBA object from design file data <- dba(sampleSheet=args[1]) data <- dba.count(data) data <- dba.contrast(data, minMembers = 2, categories=DBA_CONDITION) data <- dba.analyze(data) -#Draw figures -pdf("diffbind.samples.heatmap.pdf") +# Draw figures +pdf("heatmap.pdf") plot(data) dev.off() -pdf("diffbind.samples.pca.pdf") +pdf("pca.pdf") dba.plotPCA(data, DBA_TISSUE, label=DBA_CONDITION) dev.off() -#Save peak reads count +# Save peak reads count normcount <- dba.peakset(data, bRetrieve=T) -write.table(as.data.frame(normcount),"diffbind.normcount.txt",sep="\t",quote=F,row.names=F) +write.table(as.data.frame(normcount),"normcount_peaksets.txt",sep="\t",quote=F,row.names=F) -#Retriving the differentially bound sites -#make new design file for chipseeker at the same time +# Reteriving the differentially bound sites +# Make new design file for peakAnnotation at the same time new_SampleID = c() new_Peaks = c() -for (i in c(1:length(data$contrasts))) -{ +for (i in c(1:length(data$contrasts))) { contrast_bed_name = paste(data$contrasts[[i]]$name1,"vs", data$contrasts[[i]]$name2,"diffbind.bed",sep="_") contrast_name = paste(data$contrasts[[i]]$name1,"vs", - data$contrasts[[i]]$name2,"diffbind.xls",sep="_") + data$contrasts[[i]]$name2,"diffbind.csv",sep="_") new_SampleID = c(new_SampleID,paste(data$contrasts[[i]]$name1,"vs",data$contrasts[[i]]$name2,sep="_")) new_Peaks = c(new_Peaks, contrast_bed_name) report <- dba.report(data, contrast=i, th=1, bCount=TRUE) report <- as.data.frame(report) - print(head(report)) colnames(report)[1:5]<-c("chrom","peak_start","peak_stop","peak_width","peak_strand") - #print(head(report)) + write.table(report,contrast_name,sep="\t",quote=F,row.names=F) write.table(report[,c(1:3)],contrast_bed_name,sep="\t",quote=F,row.names=F, col.names=F) } -#Write new design file -newdesign = data.frame(SampleID=new_SampleID, Peaks=new_Peaks) -write.csv(newdesign,"diffpeak.design",row.names=F,quote=F) diff --git a/workflow/scripts/motif_search.py b/workflow/scripts/motif_search.py new file mode 100644 index 0000000000000000000000000000000000000000..9feac8c39201af96700f69370a70b24a7111a5fc --- /dev/null +++ b/workflow/scripts/motif_search.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 + +'''Call Motifs on called peaks.''' + +import os +import sys +import re +from re import sub +import string +import argparse +import logging +import subprocess +import pandas as pd +import utils +from multiprocessing import Pool + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +def get_args(): + '''Define arguments.''' + + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-d', '--design', + help="The design file to run motif search.", + required=True) + + parser.add_argument('-g', '--genome', + help="The genome FASTA file.", + required=True) + + parser.add_argument('-p', '--peak', + help="The number of peaks to use.", + required=True) + + args = parser.parse_args() + return args + +# Functions + +def run_wrapper(args): + motif_search(*args) + +def motif_search(filename, genome, experiment, peak): + + file_basename = os.path.basename(filename) + sorted_fn = 'sorted-%s' % (file_basename) + out_fa = '%s.fa' % (experiment) + out_motif = '%s_memechip' % (experiment) + + # Sort Bed file and limit number of peaks + if peak == -1: + peak = utils.count_lines(filename) + + out, err = utils.run_pipe([ + 'sort -k %dgr,%dgr %s' % (5, 5, filename), + 'head -n %s' % (peak)], outfile=sorted_fn) + + # Get fasta file + out, err = utils.run_pipe([ + 'bedtools getfasta -fi %s -bed %s -fo %s' % (genome, sorted_fn, out_fa)]) + + #Call memechip + out, err = utils.run_pipe([ + 'meme-chip -oc %s -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 %s -norand' % (out_motif, out_fa)]) + +def main(): + args = get_args() + design = args.design + genome = args.genome + peak = args.peak + + # Read files + design_df = pd.read_csv(design, sep='\t') + + meme_arglist = zip(design_df['Peaks'].tolist(), [genome]*design_df.shape[0], design_df['Condition'].tolist(), [peak]*design_df.shape[0]) + work_pool = Pool(min(12,design_df.shape[0])) + return_list = work_pool.map(run_wrapper, meme_arglist) + work_pool.close() + work_pool.join() + +if __name__=="__main__": + main() diff --git a/workflow/scripts/overlap_peaks.py b/workflow/scripts/overlap_peaks.py index 379f8841c205b70a664002d6ac6ddae3917928d7..434caa08799a2d6b3bf460ea8520d08a5b92c852 100644 --- a/workflow/scripts/overlap_peaks.py +++ b/workflow/scripts/overlap_peaks.py @@ -166,7 +166,7 @@ def overlap(experiment, design): os.remove(overlap_tr_fn) os.remove(overlap_pr_fn) - return overlapping_peaks_fn + return os.path.abspath(overlapping_peaks_fn) def main(): @@ -182,15 +182,20 @@ def main(): design_peaks_df = pd.read_csv(design, sep='\t') design_files_df = pd.read_csv(files, sep='\t') - # Make a design file for + # Make a design file for differential binding design_diff = update_design(design_files_df) + # Make a design file for annotating Peaks + anno_cols = ['Condition', 'Peaks'] + design_anno = pd.DataFrame(columns = anno_cols) + # Find consenus overlap peaks for each experiment for experiment, df_experiment in design_peaks_df.groupby('experiment_id'): replicated_peak = overlap(experiment, df_experiment) design_diff.loc[design_diff.experiment_id == experiment, "peak"] = replicated_peak + design_anno.loc[experiment] = [experiment, replicated_peak] - # Write out file + # Write out design files design_diff.columns = ['SampleID', 'bamReads', 'Condition', @@ -203,7 +208,12 @@ def main(): 'Peaks', 'PeakCaller'] - design_diff.to_csv("design_diffPeaks.tsv", header=True, sep='\t', index=False) + design_diff.to_csv("design_diffPeaks.csv", header=True, sep=',', index=False) + design_anno.to_csv("design_annotatePeaks.tsv", header=True, sep='\t', index=False) + + # Write the unique conditions + unique_experiments = pd.DataFrame(design_diff['Condition'].unique().tolist(), columns=['Condition']) + unique_experiments.to_csv('unique_experiments.csv', index=False) if __name__ == '__main__': diff --git a/workflow/scripts/runChipseeker.R b/workflow/scripts/runChipseeker.R deleted file mode 100644 index b20df7833b2ee7bcd65dec1908712f941bf8479a..0000000000000000000000000000000000000000 --- a/workflow/scripts/runChipseeker.R +++ /dev/null @@ -1,56 +0,0 @@ -args = commandArgs(trailingOnly=TRUE) -#if (length(args)==0) { -# stop("At least one argument must be supplied (input file).n", call.=FALSE) -#} else if (length(args)==1) { -# # default output file -# args[3] = "out.txt" -#} - -library(ChIPseeker) -#Parse the genome path and get genome version -path_elements = unlist(strsplit(args[2],"[/]")) -genome = path_elements[length(path_elements)] - -if(genome=="GRCh37") -{ -library(TxDb.Hsapiens.UCSC.hg19.knownGene) -txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene -} -if(genome=="GRCm38") -{ -library(TxDb.Mmusculus.UCSC.mm10.knownGene) -txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene -} -if(genome=="GRCh38") -{ -library(TxDb.Hsapiens.UCSC.hg38.knownGene) -txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene -} - -design<-read.csv(args[1]) -files<-as.list(as.character(design$Peaks)) -names(files)<-design$SampleID - - -peakAnnoList <- lapply(files, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE) -for(index in c(1:length(peakAnnoList))) -{ - filename<-paste(names(files)[index],".chipseeker_annotation.xls",sep="") - write.table(as.data.frame(peakAnnoList[[index]]),filename,sep="\t",quote=F) - #draw individual plot - pie_name <- paste(names(files)[index],".chipseeker_pie.pdf",sep="") - vennpie_name <- paste(names(files)[index],".chipseeker_vennpie.pdf",sep="") - upsetplot_name <- paste(names(files)[index],".chipseeker_upsetplot.pdf",sep="") - pdf(pie_name) - plotAnnoPie(peakAnnoList[[index]]) - dev.off() - pdf(vennpie_name) - vennpie(peakAnnoList[[index]]) - dev.off() - pdf(upsetplot_name) - upsetplot(peakAnnoList[[index]]) - dev.off() - - -} - diff --git a/workflow/scripts/runMemechip.py b/workflow/scripts/runMemechip.py deleted file mode 100644 index b1f848f0376a449329594d1fe3c67b3b0e454c84..0000000000000000000000000000000000000000 --- a/workflow/scripts/runMemechip.py +++ /dev/null @@ -1,91 +0,0 @@ -#!/qbrc/software/Python-2.7.7/bin/python -# programmer : bbc -# usage: - -import sys -import re -from re import sub -import string -import argparse as ap -import logging -import twobitreader -import subprocess -import pandas as pd -from Bio import SeqIO -from Bio.Seq import Seq -from Bio.SeqRecord import SeqRecord -from multiprocessing import Pool -logging.basicConfig(level=10) - - -def prepare_argparser(): - description = "Run memechip command" - epilog = "For command line options of each command, type %(prog)% COMMAND -h" - argparser = ap.ArgumentParser(description=description, epilog = epilog) - argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input BED file") -# argparser.add_argument("-o","--output",dest = "outfile",type=str,required=True, help="output") - argparser.add_argument("-g","--genome",dest = "genome",type=str, help="Genome 2 bit file") - # argparser.add_argument("-m","--mask",dest = "mask",type=bool,default=False, help="Convert repeats to N") - argparser.add_argument("-l","--limit",dest = "limit",type=int,default=-1, help="Top limit of peaks") - return(argparser) - -def rc(seq): - comps = {'A':"T",'C':"G",'G':"C",'T':"A","N":"N"} - return ''.join([comps[x] for x in seq.upper()[::-1]]) - -def main(): - argparser = prepare_argparser() - args = argparser.parse_args() - #run(args.infile, args.genome, args.limit, args.output) - #get Pool ready - dfile = pd.read_csv(args.infile) - meme_arglist = zip(dfile['Peaks'].tolist(),[args.genome+"/genome.2bit"]*dfile.shape[0],[str(args.limit)]*dfile.shape[0],dfile['SampleID'].tolist()) - work_pool = Pool(min(12,dfile.shape[0])) - resultList = work_pool.map(run_wrapper, meme_arglist) - work_pool.close() - work_pool.join() - - -def run_wrapper(args): - run(*args) - - -def run(infile, genome, limit, output): - infile = open(infile).readlines() - logging.debug(len(infile)) - genome = twobitreader.TwoBitFile(genome) - outfile = open(output+".fa","w") - rowcount = 0 - limit = int(limit) - logging.debug(limit) - if limit < 0: - limit = len(infile) - for record in infile: - rowcount += 1 - #logging.debug(record) - if rowcount <=limit: - seqbuf = record.rstrip().split("\t") - try: - #logging.debug(record.chrom) - seq = genome[seqbuf[0]][int(seqbuf[1]):int(seqbuf[2])] - except: - pass - else: - if len(seqbuf)>=5: - if seqbuf[5] == "-": - seq = rc(seq) - if len(seqbuf)>=4: - newfa_name = seqbuf[3] - else: - newfa_name = "_".join(seqbuf) - newfa = SeqRecord(Seq(seq),newfa_name,description="") - #logging.debug(seq) - SeqIO.write(newfa,outfile,"fasta") - outfile.close() - #Call memechip - meme_command = "meme-chip -oc "+output+"_memechip"+" -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 "+output+".fa" - p = subprocess.Popen(meme_command, shell=True) - p.communicate() - -if __name__=="__main__": - main() diff --git a/workflow/tests/test_annotate_peaks.py b/workflow/tests/test_annotate_peaks.py new file mode 100644 index 0000000000000000000000000000000000000000..692ddced7762a9a9742c6bf51e652673b62bdd1f --- /dev/null +++ b/workflow/tests/test_annotate_peaks.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 + +import pytest +import pandas as pd +from io import StringIO +import os +import utils + +test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../output/peakAnnotation/' + + +@pytest.mark.singleend +def test_annotate_peaks_singleend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC.chipseeker_pie.pdf')) + assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC.chipseeker_upsetplot.pdf')) + annotation_file = test_output_path + 'ENCSR238SGC.chipseeker_annotation.tsv' + assert os.path.exists(annotation_file) + assert utils.count_lines(annotation_file) == 152840 + + +@pytest.mark.pairedend +def test_annotate_peaks_pairedend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA.chipseeker_pie.pdf')) + assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA.chipseeker_upsetplot.pdf')) + annotation_file = test_output_path + 'ENCSR729LGA.chipseeker_annotation.tsv' + assert os.path.exists(annotation_file) + assert utils.count_lines(annotation_file) == 25391 diff --git a/workflow/tests/test_call_peaks_macs.py b/workflow/tests/test_call_peaks_macs.py index 6182f24ed407eac85dcbaf155272b6f67b6b3a5a..71358bc045801d23d28b8f7385c5ef9f608347b6 100644 --- a/workflow/tests/test_call_peaks_macs.py +++ b/workflow/tests/test_call_peaks_macs.py @@ -8,15 +8,17 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ '/../output/callPeaksMACS/' -@pytest.mark.integration +@pytest.mark.singleend def test_call_peaks_macs_singleend(): assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT.fc_signal.bw')) assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT.pvalue_signal.bw')) peak_file = test_output_path + 'ENCLB144FDT_peaks.narrowPeak' - assert utils.count_lines(peak_file) == 210349 + assert utils.count_lines(peak_file) == 227389 -@pytest.mark.integration +@pytest.mark.pairedend def test_call_peaks_macs_pairedend(): - # Do the same thing for paired end data - pass + assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX.fc_signal.bw')) + assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX.pvalue_signal.bw')) + peak_file = test_output_path + 'ENCLB568IYX_peaks.narrowPeak' + assert utils.count_lines(peak_file) == 112652 diff --git a/workflow/tests/test_convert_reads.py b/workflow/tests/test_convert_reads.py index 1067547943b54cce1d4ae2cf10f03517f342c2bd..ff8a003bfc91f5778648b238a2d94a1e04c366cc 100644 --- a/workflow/tests/test_convert_reads.py +++ b/workflow/tests/test_convert_reads.py @@ -7,14 +7,13 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ '/../output/convertReads/' -@pytest.mark.integration +@pytest.mark.singleend def test_convert_reads_singleend(): assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.tagAlign.gz')) assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bedse.gz')) -@pytest.mark.integration +@pytest.mark.pairedend def test_map_qc_pairedend(): - # Do the same thing for paired end data - # Also check that bedpe exists - pass + assert os.path.exists(os.path.join(test_output_path, 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.tagAlign.gz')) + assert os.path.exists(os.path.join(test_output_path, 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.bedpe.gz')) diff --git a/workflow/tests/test_diff_peaks.py b/workflow/tests/test_diff_peaks.py new file mode 100644 index 0000000000000000000000000000000000000000..1d48b6182efd600400b8d2032732dfbb62b704e7 --- /dev/null +++ b/workflow/tests/test_diff_peaks.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 + +import pytest +import pandas as pd +from io import StringIO +import os +import utils + +test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../output/diffPeaks/' + + +@pytest.mark.singleend +def test_diff_peaks_singleend_single_rep(): + assert os.path.isdir(test_output_path) == False + +@pytest.mark.pairedend +def test_annotate_peaks_pairedend_single_rep(): + assert os.path.isdir(test_output_path) == False + +@pytest.mark.singlediff +def test_diff_peaks_singleend_multiple_rep(): + assert os.path.exists(os.path.join(test_output_path, 'heatmap.pdf')) + assert os.path.exists(os.path.join(test_output_path, 'pca.pdf')) + assert os.path.exists(os.path.join(test_output_path, 'normcount_peaksets.txt')) + assert os.path.exists(os.path.join(test_output_path, 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.bed')) + diffbind_file = test_output_path + 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.csv' + assert os.path.exists(diffbind_file) + assert utils.count_lines(diffbind_file) == 201039 + +@pytest.mark.paireddiff +def test_annotate_peaks_pairedend_single_rep(): + assert os.path.exists(os.path.join(test_output_path, 'heatmap.pdf')) + assert os.path.exists(os.path.join(test_output_path, 'pca.pdf')) + assert os.path.exists(os.path.join(test_output_path, 'normcount_peaksets.txt')) + assert os.path.exists(os.path.join(test_output_path, 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.bed')) + diffbind_file = test_output_path + 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.csv' + assert os.path.exists(diffbind_file) + assert utils.count_lines(diffbind_file) == 66201 diff --git a/workflow/tests/test_experiment_design.py b/workflow/tests/test_experiment_design.py index 51ada8680d4f5d74881715f64ad7137301ea592f..3efcca0907a00263513c966c6f465dfb56aa0ee5 100644 --- a/workflow/tests/test_experiment_design.py +++ b/workflow/tests/test_experiment_design.py @@ -30,15 +30,17 @@ def test_check_update_controls_tag(design_tag): assert new_design.loc[0, 'control_tag_align'] == "B_1.tagAlign.gz" -@pytest.mark.integration -def test_experiment_design_single_end(): +@pytest.mark.singleend +def test_experiment_design_singleend(): design_file = os.path.join(test_output_path, 'ENCSR238SGC.tsv') assert os.path.exists(design_file) design_df = pd.read_csv(design_file, sep="\t") assert design_df.shape[0] == 2 -@pytest.mark.integration -def test_experiment_design_paired_end(): - # Do the same thing for paired end data - pass +@pytest.mark.pairedend +def test_experiment_design_pairedend(): + design_file = os.path.join(test_output_path, 'ENCSR729LGA.tsv') + assert os.path.exists(design_file) + design_df = pd.read_csv(design_file, sep="\t") + assert design_df.shape[0] == 2 diff --git a/workflow/tests/test_experiment_qc.py b/workflow/tests/test_experiment_qc.py index 308ebe2acc432740305e17c3cefef65d8ea105b6..5256da5fbebc424ea29a1977d9257bfc95060ae3 100644 --- a/workflow/tests/test_experiment_qc.py +++ b/workflow/tests/test_experiment_qc.py @@ -30,7 +30,7 @@ def test_check_update_controls(design_bam): assert new_design.loc[0, 'control_reads'] == "B_1.bam" -@pytest.mark.integration +@pytest.mark.singleend def test_experiment_qc_singleend(): assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz')) assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.png')) @@ -38,7 +38,7 @@ def test_experiment_qc_singleend(): assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT_fingerprint.png')) assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI_fingerprint.png')) -@pytest.mark.integration +@pytest.mark.pairdend def test_experiment_qc_pairedend(): assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz')) assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.png')) diff --git a/workflow/tests/test_map_qc.py b/workflow/tests/test_map_qc.py index a882e53a0fc45ee9fa9d501b5160e73ebd2816c5..95841df0aa2e6150da1136cdb295eb93651c5a18 100644 --- a/workflow/tests/test_map_qc.py +++ b/workflow/tests/test_map_qc.py @@ -8,7 +8,7 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ '/../output/filterReads/' -@pytest.mark.integration +@pytest.mark.singleend def test_map_qc_singleend(): assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam')) assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam.bai')) @@ -23,7 +23,16 @@ def test_map_qc_singleend(): assert df_library_complexity["PBC2"].iloc[0] == 13.706885 -@pytest.mark.integration +@pytest.mark.pairedend def test_map_qc_pairedend(): - # Do the same thing for paired end data - pass + assert os.path.exists(os.path.join(test_output_path, 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.bam')) + assert os.path.exists(os.path.join(test_output_path, 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.bam.bai')) + filtered_reads_report = test_output_path + 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.flagstat.qc' + samtools_report = open(filtered_reads_report).readlines() + assert '47389080 + 0 in total' in samtools_report[0] + assert '47389080 + 0 mapped (100.00%:N/A)' in samtools_report[4] + library_complexity = test_output_path + 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.pbc.qc' + df_library_complexity = pd.read_csv(library_complexity, sep='\t') + assert df_library_complexity["NRF"].iloc[0] == 0.947064 + assert df_library_complexity["PBC1"].iloc[0] == 0.946724 + assert df_library_complexity["PBC2"].iloc[0] == 18.643039 diff --git a/workflow/tests/test_map_reads.py b/workflow/tests/test_map_reads.py index 5c2ff2be2e5ec0e3675ea3713790743fc3aff344..328858216abb88528c2d25e06bce20d7d469f1cb 100644 --- a/workflow/tests/test_map_reads.py +++ b/workflow/tests/test_map_reads.py @@ -7,7 +7,7 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ '/../output/alignReads/' -@pytest.mark.integration +@pytest.mark.singleend def test_map_reads_singleend(): assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.srt.bam')) aligned_reads_report = test_output_path + 'ENCFF646LXU.srt.bam.flagstat.qc' @@ -16,7 +16,11 @@ def test_map_reads_singleend(): assert '80050072 + 0 mapped (99.08% : N/A)' in samtools_report[4] -@pytest.mark.integration +@pytest.mark.pairedend def test_map_reads_pairedend(): - # Do the same thing for paired end data - pass + assert os.path.exists(os.path.join(test_output_path, 'ENCFF002DTU_val_1ENCFF002EFI_val_2.srt.bam')) + aligned_reads_report = test_output_path + 'ENCFF002DTU_val_1ENCFF002EFI_val_2.srt.bam.flagstat.qc' + samtools_report = open(aligned_reads_report).readlines() + assert '72660890 + 0 in total' in samtools_report[0] + assert '72053925 + 0 mapped (99.16% : N/A)' in samtools_report[4] + assert '71501126 + 0 properly paired (98.40% : N/A)' in samtools_report[8] diff --git a/workflow/tests/test_motif_search.py b/workflow/tests/test_motif_search.py new file mode 100644 index 0000000000000000000000000000000000000000..ca84f4a60745e5f9080ba40fe148787b5bd740cf --- /dev/null +++ b/workflow/tests/test_motif_search.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 + +import pytest +import pandas as pd +from io import StringIO +import os +import utils + +test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../output/motifSearch/' + + +@pytest.mark.singleend +def test_motif_search_singleend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC_memechip', 'ENCSR238SGC.fa')) + assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC_memechip', 'index.html')) + peak_file_ENCSR238SGC = test_output_path + 'sorted-ENCSR238SGC.replicated.narrowPeak' + assert os.path.exists(peak_file_ENCSR238SGC) + assert utils.count_lines(peak_file_ENCSR238SGC) == 600 + +@pytest.mark.pairedend +def test_motif_search_pairedend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA_memechip', 'ENCSR729LGA.fa')) + assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA_memechip', 'index.html')) + peak_file_ENCSR729LGA= test_output_path + 'sorted-ENCSR729LGA.replicated.narrowPeak' + assert os.path.exists(peak_file_ENCSR729LGA) + assert utils.count_lines(peak_file_ENCSR729LGA) == 600 diff --git a/workflow/tests/test_overlap_peaks.py b/workflow/tests/test_overlap_peaks.py index f75ebcd43d45c18b1418aab0c7340122e2a8ff06..99d43b87939617c801bad0389f14e2683cde0f82 100644 --- a/workflow/tests/test_overlap_peaks.py +++ b/workflow/tests/test_overlap_peaks.py @@ -33,14 +33,15 @@ def test_check_update_design(design_diff): assert new_design.loc[0, 'peak_caller'] == "bed" -@pytest.mark.integration +@pytest.mark.singleend def test_overlap_peaks_singleend(): assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC.rejected.narrowPeak')) peak_file = test_output_path + 'ENCSR238SGC.replicated.narrowPeak' - assert utils.count_lines(peak_file) == 150302 + assert utils.count_lines(peak_file) == 152848 -@pytest.mark.integration -def test_call_peaks_macs_pairedend(): - # Do the same thing for paired end data - pass +@pytest.mark.pairedend +def test_overlap_peaks_pairedend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA.rejected.narrowPeak')) + peak_file = test_output_path + 'ENCSR729LGA.replicated.narrowPeak' + assert utils.count_lines(peak_file) == 25655 diff --git a/workflow/tests/test_pool_and_psuedoreplicate.py b/workflow/tests/test_pool_and_psuedoreplicate.py index 6c30f7e1bb5bbaf3aa21b46bc3e54c1f0e162163..31fffc57e7eaa598a933c97c92b1c901ba731ba7 100644 --- a/workflow/tests/test_pool_and_psuedoreplicate.py +++ b/workflow/tests/test_pool_and_psuedoreplicate.py @@ -60,15 +60,17 @@ def test_check_controls_single(design_experiment_3): assert no_controls == 1 -@pytest.mark.integration -def test_pool_and_psuedoreplicate_single_end(): +@pytest.mark.singleend +def test_pool_and_psuedoreplicate_singleend(): design_file = os.path.join(test_output_path, 'ENCSR238SGC_ppr.tsv') assert os.path.exists(design_file) design_df = pd.read_csv(design_file, sep="\t") assert design_df.shape[0] == 5 -@pytest.mark.integration -def test_experiment_design_paired_end(): - # Do the same thing for paired end data - pass +@pytest.mark.pairedend +def test_experiment_design_pairedend(): + design_file = os.path.join(test_output_path, 'ENCSR729LGA_ppr.tsv') + assert os.path.exists(design_file) + design_df = pd.read_csv(design_file, sep="\t") + assert design_df.shape[0] == 5 diff --git a/workflow/tests/test_trim_reads.py b/workflow/tests/test_trim_reads.py index 92f4716b21ac7ad3cde599ee08edf34f095e70e6..8e2176d7044392f9ed9e00801b3e14d1c627874d 100644 --- a/workflow/tests/test_trim_reads.py +++ b/workflow/tests/test_trim_reads.py @@ -10,7 +10,7 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ -@pytest.mark.integration +@pytest.mark.singleend def test_trim_reads_singleend(): raw_fastq = test_data_path + 'ENCFF833BLU.fastq.gz' trimmed_fastq = test_output_path + 'ENCFF833BLU_trimmed.fq.gz' @@ -21,7 +21,12 @@ def test_trim_reads_singleend(): assert 'Trimming mode: single-end' in open(trimmed_fastq_report).readlines()[4] -@pytest.mark.integration +@pytest.mark.pairedend def test_trim_reads_pairedend(): - # Do the same thing for paired end data - pass + raw_fastq = test_data_path + 'ENCFF582IOZ.fastq.gz' + trimmed_fastq = test_output_path + 'ENCFF582IOZ_val_2.fq.gz' + trimmed_fastq_report = test_output_path + \ + 'ENCFF582IOZ.fastq.gz_trimming_report.txt' + assert os.path.getsize(raw_fastq) != os.path.getsize(trimmed_fastq) + assert os.path.getsize(trimmed_fastq) == 2229312710 + assert 'Trimming mode: paired-end' in open(trimmed_fastq_report).readlines()[4] diff --git a/workflow/tests/test_xcor.py b/workflow/tests/test_xcor.py index cf4840850e4b236e853882a3820e456259c7b17a..2d3fdcff16d3a42c5af2e7c4beb6f777e0ddbba0 100644 --- a/workflow/tests/test_xcor.py +++ b/workflow/tests/test_xcor.py @@ -8,17 +8,21 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ '/../output/crossReads/' -@pytest.mark.integration -def test_convert_reads_singleend(): +@pytest.mark.singleend +def test_cross_singleend(): assert os.path.exists(os.path.join(test_output_path, 'ENCFF833BLU.filt.nodup.tagAlign.15.tagAlign.gz.cc.plot.pdf')) qc_file = os.path.join(test_output_path,"ENCFF833BLU.filt.nodup.tagAlign.15.tagAlign.gz.cc.qc") df_xcor = pd.read_csv(qc_file, sep="\t", header=None) - assert df_xcor[2].iloc[0] == '195,215,230' - assert df_xcor[8].iloc[0] == 1.024836 - assert df_xcor[9].iloc[0] == 1.266678 + assert df_xcor[2].iloc[0] == '190,200,210' + assert df_xcor[8].iloc[0] == 1.025906 + assert round(df_xcor[9].iloc[0], 6) == 1.139671 -@pytest.mark.integration -def test_map_qc_pairedend(): - # Do the same thing for paired end data - pass +@pytest.mark.pairedend +def test_cross_pairedend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCFF582IOZ_val_2ENCFF957SQS_val_1.filt.nodup.tagAlign.15.tagAlign.gz.cc.plot.pdf')) + qc_file = os.path.join(test_output_path,"ENCFF582IOZ_val_2ENCFF957SQS_val_1.filt.nodup.tagAlign.15.tagAlign.gz.cc.qc") + df_xcor = pd.read_csv(qc_file, sep="\t", header=None) + assert df_xcor[2].iloc[0] == '210,220,475' + assert round(df_xcor[8].iloc[0],6) == 1.062032 + assert df_xcor[9].iloc[0] == 3.737722