Skip to content
Snippets Groups Projects
Commit f08db8a9 authored by Holly Ruess's avatar Holly Ruess
Browse files

Fix convert reads

parent ec9f9468
Branches
Tags
1 merge request!11Resolve "Fix Convert Reads"
Pipeline #5315 passed with stages
in 6 hours, 38 minutes, and 35 seconds
......@@ -7,6 +7,8 @@ All notable changes to this project will be documented in this file.
- Removed biosample, factor, treatment from design file
- Updated documentation
- Output graphics are in pdf format
- Not optional to remove chrM while converting to tagAlign
- Not optional to do a tn5 shift on tagAlign files
### Added
- Changelog
......
......@@ -49,6 +49,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
3. Map reads with BWA, filter with SamTools, and sort with Sambamba
4. Mark duplicates with Sambamba, Filter reads with SamTools, and calculate library complexity with SamTools and bedtools
5. QC metrics with deep tools
6. Convert bam files to tagAlign files; remove chrM and adds tn5 shift
## Output Files
......@@ -64,8 +65,8 @@ filterReads | *.dedup.bam | filtered bam file with duplicate reads removed
filterReads | *.dedup.bam.bai | indexed filtered bam file
filterReads | *.dedup.flagstat.qc | QC metrics of filtered bam file (mapping stats, samtools)
filterReads | *.dedup.pbc.qc | QC metrics of library complexity
convertReads | *.tagAlign.gz | bed alignent in BEDPE or BEDSE format
experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sample
experimentQC | *_fingerprint.pdf | plot to determine if the antibody-treatment enriched sufficiently
experimentQC | heatmeap_SpearmanCorr.pdf | plot of Spearman correlation between samples
experimentQC | heatmeap_PearsonCorr.pdf | plot of Pearson correlation between samples
experimentQC | sample_mbs.npz | array of multiple BAM summaries
......
......@@ -6,8 +6,6 @@
// Define Input variables
params.reads = "$baseDir/../test_data/*.fastq.gz"
params.pairedEnd = false
params.chrM = true
params.tn5 = true
params.designFile = "$baseDir/../test_data/design_ENCSR265ZXX_SE.txt"
params.genome = 'GRCh38'
params.genomes = []
......@@ -36,8 +34,6 @@ readsList = Channel
// Define regular variables
pairedEnd = params.pairedEnd
chrM = params.chrM
tn5 = params.tn5
designFile = params.designFile
genomeSize = params.genomeSize
chromSizes = params.chromSizes
......@@ -286,64 +282,47 @@ process experimentQC {
// Convert reads to bam
process convertReads {
tag "$sampleId-$replicate"
publishDir "$baseDir/output/${task.process}", mode: 'copy'
tag "${sampleId}-${replicate}"
publishDir "${outDir}/${task.process}", mode: 'copy'
input:
set sampleId, deduped, bai, experimentId, replicate from convertReads
set sampleId, deduped, bai, experimentId, replicate from convertReads
output:
set sampleId, file('*.tagAlign.gz'), experimentId, replicate into tagReads
set sampleId, file('*.tagAlign.gz'), experimentId, replicate into tagReads
file('version_*.txt') into convertReadsVersions
script:
if (pairedEnd) {
if (tn5 && chrM){
"""
python3 $baseDir/scripts/convert_reads.py -b $deduped -p -m -t
"""
}
else if (tn5){
"""
python3 $baseDir/scripts/convert_reads.py -b $deduped -p -t
"""
}
else if (chrM){
"""
python3 $baseDir/scripts/convert_reads.py -b $deduped -p -m
"""
}
else{
"""
python3 $baseDir/scripts/convert_reads.py -b $deduped -p
"""
}
}
else {
if (tn5 && chrM){
"""
python3 $baseDir/scripts/convert_reads.py -b $deduped -m -t
"""
}
else if (tn5){
"""
python3 $baseDir/scripts/convert_reads.py -b $deduped -t
"""
}
else if (chrM){
"""
python3 $baseDir/scripts/convert_reads.py -b $deduped -m
"""
if (params.astrocyte == true) {
if (pairedEnd) {
"""
module load python/3.6.1-2-anaconda
module load samtools/1.4.1
module load bedtools/2.26.0
python3 ${baseDir}/scripts/convert_reads.py -b ${deduped} -p
"""
}
else {
"""
module load python/3.6.1-2-anaconda
module load samtools/1.4.1
module load bedtools/2.26.0
python3 ${baseDir}/scripts/convert_reads.py -b ${deduped}
"""
}
}
else{
"""
python3 $baseDir/scripts/convert_reads.py -b $deduped
"""
else {
if (pairedEnd) {
"""
python3 ${baseDir}/scripts/convert_reads.py -b ${deduped} -p
"""
}
else {
"""
python3 ${baseDir}/scripts/convert_reads.py -b ${deduped}
"""
}
}
}
}
......
......@@ -40,16 +40,6 @@ def get_args():
default=False,
action='store_true')
parser.add_argument('-m', '--chrm',
help="True/False if to remove ChrM.",
default=False,
action='store_true')
parser.add_argument('-t', '--tn5',
help='Enable TN5 shifting for ATAC-Seq.',
default=False,
action='store_true')
args = parser.parse_args()
return args
......@@ -64,6 +54,15 @@ def check_tools():
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
......@@ -71,31 +70,34 @@ def check_tools():
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
# Get Version
samtools_version_command = "samtools --version"
samtools_version = subprocess.check_output(samtools_version_command, shell=True)
# Write to file
samtools_file = open("version_samtools.txt", "wb")
samtools_file.write(samtools_version)
samtools_file.close()
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
def convert_mapped(bam, tag_filename, chrm):
def convert_mapped(bam, tag_filename):
'''Use bedtools to convert to tagAlign.'''
steps = [
out, err = utils.run_pipe([
"bamToBed -i %s" % (bam),
r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""]
if chrm:
steps.extend(["grep -v 'chrM'"])
steps.extend(["gzip -nc"])
out, err = utils.run_pipe(steps, outfile=tag_filename)
r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
"grep -v 'chrM'",
r"""awk 'BEGIN {OFS = "\t"}{ if ($6 == "+") {$2 = $2 + 4} else if ($6 == "-") {$3 = $3 - 5} print $0}'""",
"gzip -nc"], outfile=tag_filename)
def convert_mapped_pe(bam, bam_basename, tag_filename, chrm):
def convert_mapped_pe(bam, bam_basename, tag_filename):
'''Use bedtools to convert to tagAlign PE data.'''
bedpe_filename = bam_basename + ".bedpe.gz"
# Name sort bam to make BEDPE
nmsrt_bam_filename = bam_basename + ".nmsrt.bam"
samtools_sort_command = \
......@@ -105,42 +107,22 @@ def convert_mapped_pe(bam, bam_basename, tag_filename, chrm):
logger.info(samtools_sort_command)
subprocess.check_output(shlex.split(samtools_sort_command))
steps = ["bamToBed -bedpe -mate1 -i %s" % (nmsrt_bam_filename)]
steps.extend(["gzip -nc"])
# Steps: 1) bamToBed, 2) remove mito,
# 3)make paired reads on separate lines, 4)tn5 shift, 5)compress
out, err = utils.run_pipe([
"bamToBed -bedpe -mate1 -i %s" % (nmsrt_bam_filename),
"grep -v 'chrM'",
r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'""",
r"""awk 'BEGIN {OFS = "\t"}{ if ($6 == "+") {$2 = $2 + 4} else if ($6 == "-") {$3 = $3 - 5} print $0}'""",
"gzip -nc"], outfile=tag_filename)
out, err = utils.run_pipe(steps, outfile=bedpe_filename)
os.remove(nmsrt_bam_filename)
if chrm:
steps.extend(["grep -v 'chrM'"])
# Convert read pairs to reads into standard tagAlign file
tag_steps = ["zcat -f %s" % (bedpe_filename)]
tag_steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
if chrm:
tag_steps.extend(["grep -v 'chrM'"])
tag_steps.extend(['gzip -cn'])
out, err = utils.run_pipe(tag_steps, outfile=tag_filename)
os.remove(bedpe_filename)
def tn5_shift_tagalign(tag_filename, shitfted_tag_filename):
'''Shift tagalign file for Tn5.'''
steps = [
"zcat -f %s" % (tag_filename),
r"""awk 'BEGIN {OFS = "\t"}{ if ($6 == "+") {$2 = $2 + 4} else if ($6 == "-") {$3 = $3 - 5} print $0}'""",
"gzip -nc"]
out, err = utils.run_pipe(steps, outfile=shitfted_tag_filename)
def main():
args = get_args()
paired = args.paired
bam = args.bam
chrm = args.chrm
tn5 = args.tn5
# Create a file handler
handler = logging.FileHandler('convert_reads.log')
......@@ -149,28 +131,16 @@ def main():
# Check if tools are present
check_tools()
# Convert PE or SE to tagAlign
# Get bam basename
bam_basename = os.path.basename(
utils.strip_extensions(bam, ['.bam']))
utils.strip_extensions(bam, ['.bam', '.dedup']))
tag_filename = bam_basename + '.tagAlign.gz'
shitfted_tag_filename = bam_basename + ".tn5.tagAlign.gz"
if paired: # paired-end data
convert_mapped_pe(bam, bam_basename, tag_filename, chrm)
convert_mapped_pe(bam, bam_basename, tag_filename)
else:
convert_mapped(bam, tag_filename, chrm)
if tn5:
logger.info("TN5-shifting TAGALIGN...")
tn5_shift_tagalign(tag_filename, shitfted_tag_filename)
else:
tn5_shift = ["cp %s %s" % (tag_filename, shitfted_tag_filename)]
out, err = utils.run_pipe(tn5_shift)
os.remove(tag_filename)
convert_mapped(bam, tag_filename)
if __name__ == '__main__':
main()
......@@ -7,14 +7,11 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/convertReads/'
@pytest.mark.integration
def test_convert_reads_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCFF115PAE.filt.nodup.tagAlign.gz'))
assert os.path.exists(os.path.join(test_output_path, 'ENCFF115PAE.filt.nodup.bedse.gz'))
@pytest.mark.singleend_human
def test_convert_reads_singleend_human():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB622FZX.tagAlign.gz'))
@pytest.mark.integration
def test_map_qc_pairedend():
# Do the same thing for paired end data
# Also check that bedpe exists
pass
@pytest.mark.pairedend_mouse
def test_convert_reads_pairedend_mouse():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB749GLW.tagAlign.gz'))
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment