Skip to content
Snippets Groups Projects
Commit e6657f34 authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Merge branch '14-FixTrimReads' into 'master'

Resolve "Fix Trim Reads"

Closes #14

See merge request !7
parents 9c3eebed 4a6eb2b8
Branches
Tags
1 merge request!7Resolve "Fix Trim Reads"
Pipeline #5262 passed with stages
in 14 hours, 6 minutes, and 56 seconds
......@@ -28,6 +28,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
##### 1) Fastq Files
+ You will need the full path to the files for the Bash Scipt
## Design file
+ The Design file is a tab-delimited file with 4 columns for Single-End and 5 columns for Paired-End. Letter, numbers, and underlines can be used in the names. However, the names must begin with a letter. Columns must be as follows:
......@@ -41,34 +42,28 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
+ See [HERE](/docs/design_ENCSR265ZXX_SE.txt) for an example design file, single-end
## SOP
This SOP describes the analysis pipeline of downstream analysis of ChIP-seq sequencing data. This pipeline includes (1) Quality control using Deeptools, (2) Peak annotation, (3) Differential peak analysis, and (4) motif analysis. BAM files and SORTED peak BED files selected as input. For each sample this workflow:
1) Annotate all peaks using ChipSeeker
2) Qulity control and signal profiling with Deeptools
3) Find differential expressed peaks using DiffBind
4) Annotate all differentially expressed peaks
5) Using MEME-ChIP in motif finding for both original peaks and differently expressed peaks
## Annotations used in the pipeline
## Pipeline
+ There are ??? steps to the pipeline
1. Check input files
2. Trim adaptors with TrimGalore!
ChipSeeker - Known gene from Bioconductor [TxDb annotation](https://bioconductor.org/packages/release/BiocViews.html#___TxDb)
Deeptools - RefGene downloaded from UCSC Table browser
## Workflow Parameters
## Output Files
Folder | File | Description
--- | --- | ---
design | N/A | Inputs used for analysis; can ignore
trimReads | *_trimming_report.txt | report detailing how many reads were trimmed
trimReads | *_trimmed.fq.gz | trimmed fastq files used for analysis
bam - Choose all ChIP-seq alignment files for analysis.
genome - Choose a genomic reference (genome).
peaks - Choose all the peak files for analysis. All peaks should be sorted by the user
design - Choose the file with the experiment design information. CSV format
toppeak - The number of top peaks used for motif analysis. Default is all
## Common Errors
If you find an error, please let the [BICF](mailto:BICF@UTSouthwestern.edu) know and we will add it here.
## Citation
Please cite individual programs and versions used [HERE](docs/references.md), and the pipeline doi: coming soon. Please cite in publications: Pipeline was developed by BICF from funding provided by Cancer Prevention and Research Institute of Texas (RP150596).
### Credits
This example worklow is derived from original scripts kindly contributed by the Bioinformatic Core Facility ([BICF](https://www.utsouthwestern.edu/labs/bioinformatics/)), in the [Department of Bioinformatics](https://www.utsouthwestern.edu/departments/bioinformatics/).
......@@ -89,40 +89,57 @@ process checkDesignFile {
if (pairedEnd) {
rawReads = designFilePaths
.splitCsv(sep: '\t', header: true)
.map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read2], row.experiment_id, row.replicate ] }
} else {
.map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read2], row.experiment_id, row.replicate, row.fq_length ] }
}
else {
rawReads = designFilePaths
.splitCsv(sep: '\t', header: true)
.map { row -> [ row.sample_id, [row.fastq_read1], row.experiment_id, row.replicate ] }
.map { row -> [ row.sample_id, [row.fastq_read1], row.experiment_id, row.replicate, row.fq_length ] }
}
// Trim raw reads using trimgalore
process trimReads {
tag "$sampleId-$replicate"
publishDir "$baseDir/output/${task.process}", mode: 'copy'
tag "${sampleId}-${replicate}"
publishDir "${outDir}/${task.process}/${sampleId}", mode: 'copy'
input:
set sampleId, reads, experimentId, replicate from rawReads
set sampleId, reads, experimentId, replicate, fqLength from rawReads
output:
set sampleId, file('*.fq.gz'), experimentId, replicate into trimmedReads
file('*trimming_report.txt') into trimgalore_results
set sampleId, file('*.fq.gz'), experimentId, replicate, fqLength into trimmedReads
file('*trimming_report.txt') into trimgaloreResults
file('version_*.txt') into trimReadsVersions
script:
if (pairedEnd) {
"""
python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -p
"""
}
else {
"""
python3 $baseDir/scripts/trim_reads.py -f ${reads[0]}
"""
}
if (params.astrocyte == true) {
if (pairedEnd) {
"""
module load python/3.6.1-2-anaconda
module load trimgalore/0.4.1
python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -s ${sampleId} -p
"""
}
else {
"""
module load python/3.6.1-2-anaconda
module load trimgalore/0.4.1
python3 ${baseDir}/scripts/trim_reads.py -f ${reads[0]} -s ${sampleId}
"""
}
}
else {
if (pairedEnd) {
"""
python3 ${baseDir}/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -s ${sampleId} -p
"""
}
else {
"""
python3 ${baseDir}/scripts/trim_reads.py -f ${reads[0]} -s ${sampleId}
"""
}
}
}
......
......@@ -5,6 +5,7 @@
import subprocess
import argparse
import shutil
import os
import logging
EPILOG = '''
......@@ -32,6 +33,10 @@ def get_args():
nargs='+',
required=True)
parser.add_argument('-s', '--sample',
help="The name of the sample.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
......@@ -49,6 +54,15 @@ def check_tools():
trimgalore_path = shutil.which("trim_galore")
if trimgalore_path:
logger.info('Found trimgalore: %s', trimgalore_path)
# Get Version
trim_version_command = "trim_galore --version"
trimgalore_version = subprocess.check_output(trim_version_command, shell=True)
# Write to file
trimgalore_file = open("version_trimgalore.txt", "wb")
trimgalore_file.write(trimgalore_version)
trimgalore_file.close()
else:
logger.error('Missing trimgalore')
raise Exception('Missing trimgalore')
......@@ -56,11 +70,46 @@ def check_tools():
cutadapt_path = shutil.which("cutadapt")
if cutadapt_path:
logger.info('Found cutadapt: %s', cutadapt_path)
# Get Version
cutadapt_version_command = "cutadapt --version"
cutadapt_version = subprocess.check_output(cutadapt_version_command, shell=True)
# Write to file
cutadapt_file = open("version_cutadapt.txt", "wb")
cutadapt_file.write(b"Version %s" % (cutadapt_version))
cutadapt_file.close()
else:
logger.error('Missing cutadapt')
raise Exception('Missing cutadapt')
def rename_reads(fastq, sample, paired):
'''Rename fastq files by sample name.'''
# Get current directory to build paths
cwd = os.getcwd()
renamed_fastq = []
if paired: # paired-end data
# Set file names
renamed_fastq.append(cwd + '/' + sample + '_R1.fastq.gz')
renamed_fastq.append(cwd + '/' + sample + '_R2.fastq.gz')
# Great symbolic links
os.symlink(fastq[0], renamed_fastq[0])
os.symlink(fastq[1], renamed_fastq[1])
else:
# Set file names
renamed_fastq.append(cwd + '/' + sample + '_R1.fastq.gz')
# Great symbolic links
os.symlink(fastq[0], renamed_fastq[0])
return renamed_fastq
def trim_reads(fastq, paired):
'''Run trim_galore on 1 or 2 files.'''
......@@ -82,6 +131,7 @@ def trim_reads(fastq, paired):
def main():
args = get_args()
fastq = args.fastq
sample = args.sample
paired = args.paired
# Create a file handler
......@@ -91,8 +141,11 @@ def main():
# Check if tools are present
check_tools()
# Rename fastq files by sample
fastq_rename = rename_reads(fastq, sample, paired)
# Run trim_reads
trim_reads(fastq, paired)
trim_reads(fastq_rename, paired)
if __name__ == '__main__':
......
......@@ -10,19 +10,39 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
@pytest.mark.integration
def test_trim_reads_singleend():
#raw_fastq = test_data_path + 'ENCFF833BLU.fastq.gz'
#trimmed_fastq = test_output_path + 'ENCFF833BLU_trimmed.fq.gz'
#trimmed_fastq_report = test_output_path + \
# 'ENCFF833BLU.fastq.gz_trimming_report.txt'
#assert os.path.getsize(raw_fastq) != os.path.getsize(trimmed_fastq)
#assert os.path.getsize(trimmed_fastq) == 2512853101
#assert 'Trimming mode: single-end' in open(trimmed_fastq_report).readlines()[4]
pass
@pytest.mark.integration
def test_trim_reads_pairedend():
# Do the same thing for paired end data
pass
@pytest.mark.pairedend_mouse
def test_trim_reads_pairedend_mouse():
raw_fastq1 = test_data_path + 'ENCFF913PMS.fastq.gz'
trimmed_fastq1 = test_output_path + 'ENCLB122XDP/ENCLB122XDP_R1_val_1.fq.gz'
raw_fastq2 = test_data_path + 'ENCFF999SZR.fastq.gz'
trimmed_fastq2 = test_output_path + 'ENCLB749GLW/ENCLB749GLW_R2_val_2.fq.gz'
assert os.path.getsize(raw_fastq1) != os.path.getsize(trimmed_fastq1)
assert os.path.getsize(trimmed_fastq1) == 934738906
assert os.path.getsize(raw_fastq2) != os.path.getsize(trimmed_fastq2)
assert os.path.getsize(trimmed_fastq2) == 1297020342
trimmed_fastq_report1 = test_output_path + \
'ENCLB122XDP/ENCLB122XDP_R1.fastq.gz_trimming_report.txt'
assert 'Trimming mode: paired-end' in open(trimmed_fastq_report1).readlines()[4]
trimmed_fastq_report2 = test_output_path + \
'ENCLB749GLW/ENCLB749GLW_R1.fastq.gz_trimming_report.txt'
assert 'Trimming mode: paired-end' in open(trimmed_fastq_report2).readlines()[4]
@pytest.mark.singleend_human
def test_trim_reads_singleend_human():
raw_fastq = test_data_path + 'ENCFF115PAE.fastq.gz'
trimmed_fastq = test_output_path + 'ENCLB170KFO/ENCLB170KFO_R1_trimmed.fq.gz'
assert os.path.getsize(raw_fastq) != os.path.getsize(trimmed_fastq)
assert os.path.getsize(trimmed_fastq) == 2171147098
trimmed_fastq_report = test_output_path + \
'ENCLB969KTX/ENCLB969KTX_R1.fastq.gz_trimming_report.txt'
assert 'Trimming mode: single-end' in open(trimmed_fastq_report).readlines()[4]
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment