Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • BICF/Astrocyte/chipseq_analysis
  • s190984/chipseq_analysis
  • astrocyte/workflows/bicf/chipseq_analysis
  • s219741/chipseq-analysis-containerized
Show changes
Showing
with 476 additions and 186 deletions
workflow/conf/bicf_logo.png

24.3 KiB

......@@ -2,6 +2,7 @@ process {
executor = 'slurm'
queue = 'super'
clusterOptions = '--hold'
beforeScript= 'ulimit -Ss unlimited'
// Process specific configuration
withName: checkDesignFile {
......@@ -44,6 +45,10 @@ process {
module = ['python/3.6.1-2-anaconda', 'macs/2.1.0-20151222', 'UCSC_userApps/v317', 'bedtools/2.26.0', 'phantompeakqualtools/1.2']
queue = '128GB,256GB,256GBv1'
}
withName: plotProfile {
module = ['deeptools/2.5.0.1']
cpus = 32
}
withName: consensusPeaks {
module = ['python/3.6.1-2-anaconda', 'bedtools/2.26.0']
executor = 'local'
......@@ -61,7 +66,7 @@ process {
cpus = 32
}
withName: multiqcReport {
module = ['python/3.6.1-2-anaconda', 'pandoc/2.7', 'multiqc/1.7']
module = ['python/3.6.1-2-anaconda', 'pandoc/2.7', 'singularity/3.0.2']
executor = 'local'
}
}
......@@ -70,22 +75,28 @@ params {
// Reference file paths on BioHPC
genomes {
'GRCh38' {
bwa = '/project/shared/bicf_workflow_ref/GRCh38'
bwa = '/project/shared/bicf_workflow_ref/human/GRCh38'
genomesize = 'hs'
chromsizes = '/project/shared/bicf_workflow_ref/GRCh38/genomefile.txt'
fasta = '/project/shared/bicf_workflow_ref/GRCh38/genome.fa'
chromsizes = '/project/shared/bicf_workflow_ref/human/GRCh38/genomefile.txt'
fasta = '/project/shared/bicf_workflow_ref/human/GRCh38/genome.fa'
gtf = '/project/shared/bicf_workflow_ref/human/GRCh38/gencode.v25.chr_patch_hapl_scaff.annotation.gtf'
geneNames = '/project/shared/bicf_workflow_ref/human/GRCh38/genenames.txt'
}
'GRCh37' {
bwa = '/project/shared/bicf_workflow_ref/GRCh37'
bwa = '/project/shared/bicf_workflow_ref/human/GRCh37'
genomesize = 'hs'
chromsizes = '/project/shared/bicf_workflow_ref/GRCh37/genomefile.txt'
fasta = '/project/shared/bicf_workflow_ref/GRCh37/genome.fa'
chromsizes = '/project/shared/bicf_workflow_ref/human/GRCh37/genomefile.txt'
fasta = '/project/shared/bicf_workflow_ref/human/GRCh37/genome.fa'
gtf = '/project/shared/bicf_workflow_ref/human/GRCh37/gencode.v19.chr_patch_hapl_scaff.annotation.gtf'
geneNames = '/project/shared/bicf_workflow_ref/human/GRCh37/genenames.txt'
}
'GRCm38' {
bwa = '/project/shared/bicf_workflow_ref/GRCm38'
bwa = '/project/shared/bicf_workflow_ref/mouse/GRCm38'
genomesize = 'mm'
chromsizes = '/project/shared/bicf_workflow_ref/GRCm38/genomefile.txt'
fasta = '/project/shared/bicf_workflow_ref/GRCm38/genome.fa'
chromsizes = '/project/shared/bicf_workflow_ref/mouse/GRCm38/genomefile.txt'
fasta = '/project/shared/bicf_workflow_ref/mouse/GRCm38/genome.fa'
gtf = '/project/shared/bicf_workflow_ref/mouse/GRCm38/gencode.vM20.annotation.gtf'
geneNames = '/project/shared/bicf_workflow_ref/mouse/GRCm38/genenames.txt'
}
}
}
......
# Custom Logo
custom_logo: 'bicf_logo.png'
custom_logo_url: 'https://www.utsouthwestern.edu/labs/bioinformatics/'
custom_logo_title: 'Bioinformatics Core Facility'
report_header_info:
- Contact E-mail: 'bicf@utsouthwestern.edu'
- Application Type: 'ChIP-seq'
- Department: 'Bioinformatic Core Facility, Department of Bioinformatics'
- Contributors and Licensing: 'See https://doi.org/10.5281/zenodo.2648844'
# Title to use for the report.
title: BICF ChIP-seq Analysis Report
......
#!/usr/bin/env nextflow
/*
BICF ChIP-seq Analysis Workflow
#### Homepage / Documentation
https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/
Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
*/
// Path to an input file, or a pattern for multiple inputs
// Note - $baseDir is the location of this workflow file main.nf
......@@ -15,26 +25,46 @@ params.topPeakCount = 600
params.astrocyte = false
params.skipDiff = false
params.skipMotif = false
params.skipPlotProfile = false
params.references = "$baseDir/../docs/references.md"
params.multiqc = "$baseDir/conf/multiqc_config.yaml"
params.ci = false
params.dev = false
// Assign variables if astrocyte
if (params.astrocyte) {
print("Running under astrocyte")
referenceLocation = "/project/shared/bicf_workflow_ref"
params.bwaIndex = "$referenceLocation/$params.genome"
params.chromSizes = "$referenceLocation/$params.genome/genomefile.txt"
params.fasta = "$referenceLocation/$params.genome/genome.fa.txt"
if (params.genome == 'GRCh37' || params.genome == 'GRCh38') {
if (params.genome == 'GRCh37') {
params.bwaIndex = "$referenceLocation/human/$params.genome"
params.chromSizes = "$referenceLocation/human/$params.genome/genomefile.txt"
params.fasta = "$referenceLocation/human/$params.genome/genome.fa"
params.gtf = "$referenceLocation/human/$params.genome/gencode.v19.chr_patch_hapl_scaff.annotation.gtf"
params.geneNames = "$referenceLocation/human/$params.genome/genenames.txt"
params.genomeSize = 'hs'
} else if (params.genome == 'GRCm38') {
params.bwaIndex = "$referenceLocation/mouse/$params.genome"
params.chromSizes = "$referenceLocation/mouse/$params.genome/genomefile.txt"
params.fasta = "$referenceLocation/mouse/$params.genome/genome.fa"
params.gtf = "$referenceLocation/mouse/$params.genome/gencode.vM20.annotation.gtf"
params.geneNames = "$referenceLocation/mouse/$params.genome/genenames.txt"
params.genomeSize = 'mm'
} else if (params.genome == 'GRCh38') {
params.bwaIndex = "$referenceLocation/human/$params.genome"
params.chromSizes = "$referenceLocation/human/$params.genome/genomefile.txt"
params.fasta = "$referenceLocation/human/$params.genome/genome.fa"
params.gtf = "$referenceLocation/human/$params.genome/gencode.v25.chr_patch_hapl_scaff.annotation.gtf"
params.geneNames = "$referenceLocation/human/$params.genome/genenames.txt"
params.genomeSize = 'hs'
}
} else {
params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false
params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false
params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false
params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false
params.gtf = params.genome ? params.genomes[ params.genome ].gtf ?: false : false
params.geneNames = params.genome ? params.genomes[ params.genome ].geneNames ?: false : false
}
......@@ -68,8 +98,36 @@ extendReadsLen = params.extendReadsLen
topPeakCount = params.topPeakCount
skipDiff = params.skipDiff
skipMotif = params.skipMotif
skipPlotProfile = params.skipPlotProfile
references = params.references
multiqc = params.multiqc
gtfFile = params.gtf
geneNames = params.geneNames
/*
* trackStart: track start of pipeline
*/
process trackStart {
script:
"""
hostname
ulimit -a
curl -H 'Content-Type: application/json' -X PUT -d '{ \
"sessionId": "${workflow.sessionId}", \
"pipeline": "chipseq_analysis", \
"start": "${workflow.start}", \
"astrocyte": ${params.astrocyte}, \
"status": "started", \
"nextflowVersion": "${workflow.nextflow.version}", \
"pipelineVersion": "1.1.2", \
"ci": ${params.ci}, \
"dev": ${params.dev}}' \
"https://xku43pcwnf.execute-api.us-east-1.amazonaws.com/ProdDeploy/pipeline-tracking"
"""
}
// Check design file for errors
process checkDesignFile {
......@@ -414,6 +472,7 @@ process callPeaksMACS {
set sampleId, file('*.narrowPeak'), file('*.fc_signal.bw'), file('*.pvalue_signal.bw'), experimentId, biosample, factor, treatment, replicate, controlId into experimentPeaks
file '*.xls' into callPeaksMACSsummit
file('version_*.txt') into callPeaksMACSVersions
file("*.fc_signal.bw") into bigwigs
script:
......@@ -446,6 +505,29 @@ peaksDesign = experimentPeaks
"$sampleId\t$peak\t$fcSignal\t$pvalueSignal\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
.collectFile(name:'design_peak.tsv', seed:"sample_id\tpeaks\tfc_signal\tpvalue_signal\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$outDir/design")
//plotProfile
process plotProfile {
publishDir "$outDir/experimentQC", mode: 'copy'
input:
file bigWigList from bigwigs.collect()
output:
file '*.{png,gz}' into plotProfile
when:
!skipPlotProfile
script:
"""
module load deeptools/2.5.0.1
bash $baseDir/scripts/plot_profile.sh -g $gtfFile
"""
}
// Calculate Consensus Peaks
process consensusPeaks {
......@@ -494,7 +576,7 @@ process peakAnnotation {
"""
module load R/3.3.2-gccmkl
Rscript $baseDir/scripts/annotate_peaks.R $designAnnotatePeaks $genome
Rscript $baseDir/scripts/annotate_peaks.R $designAnnotatePeaks $gtfFile $geneNames
"""
}
......@@ -565,7 +647,6 @@ process diffPeaks {
// Generate Multiqc Report, gerernate Software Versions and references
process multiqcReport {
publishDir "$outDir/${task.process}", mode: 'copy'
input:
......@@ -598,11 +679,12 @@ process multiqcReport {
"""
module load python/3.6.1-2-anaconda
module load pandoc/2.7
module load multiqc/1.7
module load singularity/3.0.2
echo $workflow.nextflow.version > version_nextflow.txt
multiqc --version > version_multiqc.txt
singularity exec /project/shared/bicf_workflow_ref/singularity_images/bicf-multiqc-2.0.0.img multiqc --version > version_multiqc.txt
python --version &> version_python.txt
python3 $baseDir/scripts/generate_references.py -r $references -o software_references
python3 $baseDir/scripts/generate_versions.py -o software_versions
multiqc -c $multiqc .
singularity exec /project/shared/bicf_workflow_ref/singularity_images/bicf-multiqc-2.0.0.img multiqc -c $multiqc .
"""
}
......@@ -4,11 +4,28 @@ profiles {
}
}
trace {
enabled = true
file = 'pipeline_trace.txt'
fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
}
timeline {
enabled = true
file = 'timeline.html'
}
report {
enabled = true
file = 'report.html'
}
manifest {
name = 'chipseq_analysis'
description = 'BICF ChIP-seq Analysis Workflow.'
homePage = 'https://github.com/nf-core/rnaseq'
version = '1.0.0'
homePage = 'https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis'
version = '1.1.2'
mainScript = 'main.nf'
nextflowVersion = '>=0.31.0'
}
#!/bin/Rscript
# Load libraries
library("ChIPseeker")
# Currently mouse or human
#*
#* --------------------------------------------------------------------------
#* Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
#* --------------------------------------------------------------------------
#*
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
library("TxDb.Mmusculus.UCSC.mm10.knownGene")
library("TxDb.Hsapiens.UCSC.hg38.knownGene")
library("org.Hs.eg.db")
library("org.Mm.eg.db")
#Currently Human or Mouse
# Load libraries
library("ChIPseeker")
library(GenomicFeatures)
# Create parser object
args <- commandArgs(trailingOnly=TRUE)
# Check input args
if (length(args) != 2) {
stop("Usage: annotate_peaks.R annotate_design.tsv genome_assembly", call.=FALSE)
if (length(args) != 3) {
stop("Usage: annotate_peaks.R annotate_design.tsv gtf geneNames", call.=FALSE)
}
design_file <- args[1]
genome_assembly <- args[2]
gtf <- args[2]
geneNames <- args[3]
# Load UCSC Known Genes
if(genome_assembly=='GRCh37') {
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
annodb <- 'org.Hs.eg.db'
} else if(genome_assembly=='GRCm38') {
txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
annodb <- 'org.Mm.eg.db'
} else if(genome_assembly=='GRCh38') {
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
annodb <- 'org.Hs.eg.db'
}
txdb <- makeTxDbFromGFF(gtf)
sym <- read.table(geneNames, header=T, sep='\t') [,4:5]
# Output version of ChIPseeker
chipseeker_version = packageVersion('ChIPseeker')
......@@ -48,18 +41,19 @@ names(files) <- design$Condition
# Granges of files
peaks <- lapply(files, readPeakFile, as = "GRanges", header = FALSE)
peakAnnoList <- lapply(peaks, annotatePeak, TxDb=txdb, annoDb=annodb, tssRegion=c(-3000, 3000), verbose=FALSE)
peakAnnoList <- lapply(peaks, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE)
column_names <- c("chr", "start", "end", "width", "strand_1", "name", "score", "strand", "signalValue",
column_names <- c("geneId","chr", "start", "end", "width", "strand_1", "name", "score", "strand", "signalValue",
"pValue", "qValue", "peak", "annotation", "geneChr", "geneStart", "geneEnd",
"geneLength" ,"geneStrand", "geneId", "transcriptId", "distanceToTSS",
"ENSEMBL", "symbol", "geneName")
"geneLength" ,"geneStrand", "transcriptId", "distanceToTSS", "symbol")
for(index in c(1:length(peakAnnoList))) {
filename <- paste(names(peaks)[index], ".chipseeker_annotation.tsv", sep="")
df <- as.data.frame(peakAnnoList[[index]])
colnames(df) <- column_names
write.table(df[ , !(names(df) %in% c('strand_1'))], filename, sep="\t" ,quote=F, row.names=F)
df$geneId <- sapply(strsplit(as.character(df$geneId), split = "\\."), "[[", 1)
df_final <- merge(df, sym, by.x="geneId", by.y="ensembl", all.x=T)
colnames(df_final) <- column_names
write.table(df_final[ , !(names(df_final) %in% c('strand_1'))], filename, sep="\t" ,quote=F, row.names=F)
# Draw individual plots
......
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Generate peaks from data.'''
import os
......@@ -132,8 +138,20 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
with open(xcor, 'r') as xcor_fh:
firstline = xcor_fh.readline()
frag_lengths = firstline.split()[2] # third column
fragment_length = frag_lengths.split(',')[0] # grab first value
logger.info("Fraglen %s", fragment_length)
frag_lengths_array = frag_lengths.split(',')
fragment_length = 0
fragment = False
# Loop through all values of fragment length
for f in frag_lengths.split(','):
fragment_length = f
logger.info("Fraglen %s", fragment_length)
if int(fragment_length) > 50:
fragment = True
break
if fragment == False:
logger.info('Error in cross-correlation analysis: %s', frag_lengths_array)
raise Exception("Error in cross-correlation analysis: %s" % frag_lengths_array)
# Generate narrow peaks and preliminary signal tracks
......
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Check if design file is correctly formatted and matches files list.'''
import argparse
......
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Convert tagAlign files for further processing.'''
import os
......
#!/bin/Rscript
#*
#* --------------------------------------------------------------------------
#* Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
#* --------------------------------------------------------------------------
#*
# Load libraries
library("DiffBind")
......
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Generate experiment design files for downstream processing.'''
import argparse
......
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Experiment correlation and enrichment of reads over genome-wide bins.'''
......
#!/usr/bin/env python
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Make header for HTML of references.'''
import argparse
......
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Make YAML of software versions.'''
from __future__ import print_function
......@@ -40,6 +46,7 @@ SOFTWARE_REGEX = {
'MEME-ChIP': ['motifSearch_vf/version_memechip.txt', r"Version (\S+)"],
'DiffBind': ['diffPeaks_vf/version_DiffBind.txt', r"Version (\S+)\""],
'deepTools': ['experimentQC_vf/version_deeptools.txt', r"deeptools (\S+)"],
'Python': ['version_python.txt', r"Python (\S+)"],
'MultiQC': ['version_multiqc.txt', r"multiqc, version (\S+)"],
}
......@@ -102,6 +109,7 @@ def main():
results['DiffBind'] = '<span style="color:#999999;\">Not Run</span>'
results['deepTools'] = '<span style="color:#999999;\">Not Run</span>'
results['MultiQC'] = '<span style="color:#999999;\">Not Run</span>'
results['Python'] = '<span style="color:#999999;\">Not Run</span>'
# list all files
files = glob.glob('**/*.txt', recursive=True)
......
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Remove duplicates and filter unmapped reads.'''
import os
......
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Align reads to reference genome.'''
import os
......@@ -212,6 +218,13 @@ def main():
shlex.split("samtools flagstat %s" % (bam_filename)),
stdout=temp_file)
#Genome/Bad fastq File Check
file_check = open(bam_mapstats_filename).readlines()
percent = file_check[4].split('(')[1]
percent = percent.split('%')[0]
if float(percent) < 10:
raise Exception ('Mapped Genes too low: Check for correct Genotype')
# Remove sai files
for sai_file in sai:
os.remove(sai_file)
......
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Call Motifs on called peaks.'''
import os
......
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Generate naive overlap peak files and design file for downstream processing.'''
import os
......
#!/bin/bash
#plot_profile.sh
script_name="plot_profile.sh"
#Help function
usage() {
echo "-h --Help documentation for $script_name"
echo "-g --File path to gtf/bed files"
echo "Example: $script_name -g 'genome.gtf'"
exit 1
}
raise()
{
echo "${1}" >&2
}
check_tools() {
raise "
Checking for required libraries and components on this system
"
deeptools --version &> version_deeptools.txt
if [ $? -gt 0 ]
then
raise "Missing deeptools"
return 1
fi
}
compute_matrix() {
raise "
Computing matrix on ${1} using ${2}
"
computeMatrix reference-point \
--referencePoint TSS \
-S ${1} \
-R ${2} \
--skipZeros \
-o computeMatrix.gz \
-p max/2
if [ $? -gt 0 ]
then
raise "Problem building matrix"
return 1
fi
}
plot_profile() {
raise "
Plotting profile
"
plotProfile -m computeMatrix.gz \
-out plotProfile.png
if [ $? -gt 0 ]
then
raise "Problem plotting"
return 1
fi
}
run_main() {
# Parsing options
OPTIND=1 # Reset OPTIND
while getopts :g:h opt
do
case $opt in
g) gtf=$OPTARG;;
h) usage;;
esac
done
shift $(($OPTIND -1))
# Check for mandatory options
if [[ -z $gtf ]]; then
usage
fi
bws=$(ls *pooled.fc_signal.bw)
check_tools || exit 1
compute_matrix "${bws}" "${gtf}" || return 1
plot_profile || return 1
raise "ALL COMPLETE"
}
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]
then
run_main "$@"
if [ $? -gt 0 ]
then
exit 1
fi
fi
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Generate pooled and pseudoreplicate from data.'''
import argparse
......@@ -166,26 +172,7 @@ def self_psuedoreplication(tag_file, prefix, paired):
return pseudoreplicate_dict
def main():
args = get_args()
paired = args.paired
design = args.design
cutoff_ratio = args.cutoff
# Create a file handler
handler = logging.FileHandler('experiment_generation.log')
logger.addHandler(handler)
# Read files as dataframes
design_df = pd.read_csv(design, sep='\t')
# Get current directory to build paths
cwd = os.getcwd()
# Check Number of replicates and replicates
no_reps = check_replicates(design_df)
no_unique_controls = check_controls(design_df)
def generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_controls):
if no_reps == 1:
logger.info("No other replicate specified "
"so processing as an unreplicated experiment.")
......@@ -217,134 +204,123 @@ def main():
pool_control_tmp = bedpe_to_tagalign(pool_control, "pool_control")
pool_control = pool_control_tmp
# Psuedoreplicate and update design accordingly
if not replicated:
# Duplicate rows and update for pool and psuedoreplicates and update tagAlign with single end data
experiment_id = design_df.at[0, 'experiment_id']
replicate = design_df.at[0, 'replicate']
design_new_df = design_df.loc[np.repeat(design_df.index, 4)].reset_index()
# Update tagAlign with single end data
if paired:
design_new_df['tag_align'] = design_new_df['se_tag_align']
design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
design_new_df['replicate'] = design_new_df['replicate'].astype(str)
design_new_df.at[1, 'sample_id'] = experiment_id + '_pr1'
design_new_df.at[1, 'replicate'] = '1_pr'
design_new_df.at[1, 'xcor'] = 'Calculate'
design_new_df.at[2, 'sample_id'] = experiment_id + '_pr2'
design_new_df.at[2, 'replicate'] = '2_pr'
design_new_df.at[2, 'xcor'] = 'Calculate'
design_new_df.at[3, 'sample_id'] = experiment_id + '_pooled'
design_new_df.at[3, 'replicate'] = 'pooled'
design_new_df.at[3, 'xcor'] = 'Calculate'
design_new_df.at[3, 'tag_align'] = design_new_df.at[0, 'tag_align']
# Make 2 self psuedoreplicates
self_pseudoreplicates_dict = {}
for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']):
replicate_prefix = experiment_id + '_' + str(rep)
self_pseudoreplicates_dict = \
self_psuedoreplication(tag_file, replicate_prefix, paired)
# Update design to include new self pseudo replicates
for rep, pseudorep_file in self_pseudoreplicates_dict.items():
path_to_file = cwd + '/' + pseudorep_file
replicate = rep + 1
design_new_df.loc[replicate, 'tag_align'] = path_to_file
# Drop index column
design_new_df.drop(labels='index', axis=1, inplace=True)
# Duplicate rows and update for pool and psuedoreplicates and update tagAlign with single end data
experiment_id = design_df.at[0, 'experiment_id']
replicate_files = design_df.tag_align.unique()
pool_experiment = pool(replicate_files, experiment_id + "_pooled", paired)
# Make 2 self psuedoreplicates
pseudoreplicates_dict = {}
for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']):
replicate_prefix = experiment_id + '_' + str(rep)
pr_dict = self_psuedoreplication(tag_file, replicate_prefix, paired)
pseudoreplicates_dict[rep] = pr_dict
# Update design to include new self pseudo replicates
pseudoreplicates_df = pd.DataFrame.from_dict(pseudoreplicates_dict)
pool_pseudoreplicates_dict = {}
for index, row in pseudoreplicates_df.iterrows():
replicate_id = index + 1
pr_filename = experiment_id + ".pr" + str(replicate_id) + '.tagAlign.gz'
pool_replicate = pool(row, pr_filename, False)
pool_pseudoreplicates_dict[replicate_id] = pool_replicate
design_new_df = design_df #.loc[np.repeat(design_df.index, 4)].reset_index()
# Update tagAlign with single end data
if paired:
design_new_df['tag_align'] = design_new_df['se_tag_align']
design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
# If paired change to single End
if paired:
pool_experiment_se = bedpe_to_tagalign(pool_experiment, experiment_id + "_pooled")
else:
# Make pool of replicates
replicate_files = design_df.tag_align.unique()
experiment_id = design_df.at[0, 'experiment_id']
pool_experiment = pool(replicate_files, experiment_id + "_pooled", paired)
pool_experiment_se = pool_experiment
# If paired change to single End
if paired:
pool_experiment_se = bedpe_to_tagalign(pool_experiment, experiment_id + "_pooled")
# Check controls against cutoff_ratio
# if so replace with pool_control
# unless single control was used
if not single_control:
path_to_pool_control = cwd + '/' + pool_control
if control_df.values.max() > cutoff_ratio:
logger.info("Number of reads in controls differ by " +
" > factor of %f. Using pooled controls." % (cutoff_ratio))
design_new_df['control_tag_align'] = path_to_pool_control
else:
pool_experiment_se = pool_experiment
# Make self psuedoreplicates equivalent to number of replicates
pseudoreplicates_dict = {}
for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']):
replicate_prefix = experiment_id + '_' + str(rep)
pr_dict = self_psuedoreplication(tag_file, replicate_prefix, paired)
pseudoreplicates_dict[rep] = pr_dict
# Merge self psuedoreplication for each true replicate
pseudoreplicates_df = pd.DataFrame.from_dict(pseudoreplicates_dict)
pool_pseudoreplicates_dict = {}
for index, row in pseudoreplicates_df.iterrows():
replicate_id = index + 1
pr_filename = experiment_id + ".pr" + str(replicate_id) + '.tagAlign.gz'
pool_replicate = pool(row, pr_filename, False)
pool_pseudoreplicates_dict[replicate_id] = pool_replicate
design_new_df = design_df
# Update tagAlign with single end data
if paired:
design_new_df['tag_align'] = design_new_df['se_tag_align']
design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
# Check controls against cutoff_ratio
# if so replace with pool_control
# unless single control was used
if not single_control:
path_to_pool_control = cwd + '/' + pool_control
if control_df.values.max() > cutoff_ratio:
logger.info("Number of reads in controls differ by " +
" > factor of %f. Using pooled controls." % (cutoff_ratio))
design_new_df['control_tag_align'] = path_to_pool_control
else:
for index, row in design_new_df.iterrows():
exp_no_reads = utils.count_lines(row['tag_align'])
con_no_reads = utils.count_lines(row['control_tag_align'])
if con_no_reads < exp_no_reads:
logger.info("Fewer reads in control than experiment." +
"Using pooled controls for replicate %s."
% row['replicate'])
for index, row in design_new_df.iterrows():
exp_no_reads = utils.count_lines(row['tag_align'])
con_no_reads = utils.count_lines(row['control_tag_align'])
if con_no_reads < exp_no_reads:
logger.info("Fewer reads in control than experiment." +
"Using pooled controls for replicate %s."
% row['replicate'])
design_new_df.loc[index, 'control_tag_align'] = \
path_to_pool_control
else:
if paired:
control = row['control_tag_align']
control_basename = os.path.basename(
utils.strip_extensions(control, STRIP_EXTENSIONS))
control_tmp = bedpe_to_tagalign(control, control_basename)
path_to_control = cwd + '/' + control_tmp
design_new_df.loc[index, 'control_tag_align'] = \
path_to_pool_control
else:
if paired:
control = row['control_tag_align']
control_basename = os.path.basename(
utils.strip_extensions(control, STRIP_EXTENSIONS))
control_tmp = bedpe_to_tagalign(control, control_basename)
path_to_control = cwd + '/' + control_tmp
design_new_df.loc[index, 'control_tag_align'] = \
path_to_control
path_to_control
else:
if paired:
path_to_pool_control = cwd + '/' + pool_control
else:
path_to_pool_control = cwd + '/' + pool_control
design_new_df['control_tag_align'] = path_to_pool_control
# Add in pseudo replicates
tmp_metadata = design_new_df.loc[0].copy()
tmp_metadata['control_tag_align'] = path_to_pool_control
for rep, pseudorep_file in pool_pseudoreplicates_dict.items():
tmp_metadata['sample_id'] = experiment_id + '_pr' + str(rep)
tmp_metadata['replicate'] = str(rep) + '_pr'
tmp_metadata['xcor'] = 'Calculate'
path_to_file = cwd + '/' + pseudorep_file
tmp_metadata['tag_align'] = path_to_file
design_new_df = design_new_df.append(tmp_metadata)
# Add in pool experiment
tmp_metadata['sample_id'] = experiment_id + '_pooled'
tmp_metadata['replicate'] = 'pooled'
path_to_pool_control = pool_control
design_new_df['control_tag_align'] = path_to_pool_control
# Add in pseudo replicates
tmp_metadata = design_new_df.loc[0].copy()
tmp_metadata['control_tag_align'] = path_to_pool_control
for rep, pseudorep_file in pool_pseudoreplicates_dict.items():
tmp_metadata['sample_id'] = experiment_id + '_pr' + str(rep)
tmp_metadata['replicate'] = str(rep) + '_pr'
tmp_metadata['xcor'] = 'Calculate'
path_to_file = cwd + '/' + pool_experiment_se
path_to_file = cwd + '/' + pseudorep_file
tmp_metadata['tag_align'] = path_to_file
design_new_df = design_new_df.append(tmp_metadata)
# Add in pool experiment
tmp_metadata['sample_id'] = experiment_id + '_pooled'
tmp_metadata['replicate'] = 'pooled'
tmp_metadata['xcor'] = 'Calculate'
path_to_file = cwd + '/' + pool_experiment_se
tmp_metadata['tag_align'] = path_to_file
design_new_df = design_new_df.append(tmp_metadata)
return design_new_df
def main():
args = get_args()
paired = args.paired
design = args.design
cutoff_ratio = args.cutoff
# Create a file handler
handler = logging.FileHandler('experiment_generation.log')
logger.addHandler(handler)
# Read files as dataframes
design_df = pd.read_csv(design, sep='\t')
# Get current directory to build paths
cwd = os.getcwd()
# Check Number of replicates and replicates
no_reps = check_replicates(design_df)
no_unique_controls = check_controls(design_df)
# Generate new design file
design_new_df = generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_controls)
# Write out new dataframe
experiment_id = design_df.at[0, 'experiment_id']
design_new_df.to_csv(experiment_id + '_ppr.tsv',
header=True, sep='\t', index=False)
......