Commit a14bd619 authored by Venkat Malladi

Merge branch '15-FixFilterReads' into 'master'

fix filter reads

Closes #15

See merge request !9
parents 2074cdc8 4cadae4e
Pipeline #5289 passed with stages
in 9 hours, 44 minutes, and 4 seconds
# Astrocyte ATAC-seq analysis Workflow Package
-[![Build Status](https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/badges/master/build.svg)](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/commits/master)
-[![Coverage Report](https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/badges/master/coverage.svg)](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/commits/master)
+[![Build Status](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/badges/master/build.svg)](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/commits/master)
+[![Coverage Report](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/badges/master/coverage.svg)](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/commits/master)
[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A50.24.0-brightgreen.svg)](https://www.nextflow.io/)
[![Astrocyte](https://img.shields.io/badge/astrocyte-%E2%89%A50.1.0-blue.svg)](https://astrocyte-test.biohpc.swmed.edu/static/docs/index.html)
@@ -47,6 +47,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
1. Check input files
2. Trim adaptors with TrimGalore!
3. Map reads with BWA, filter with SamTools, and sort with Sambamba
4. Mark duplicates with Sambamba, filter reads with SamTools, and calculate library complexity with SamTools and bedtools
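
The mapping and filtering steps above reduce to a short chain of tool invocations. Below is a minimal sketch of that chain driven from Python, assuming single-end data and illustrative file names (`genome.fa`, `sample_trimmed.fq.gz`); the exact flags are assumptions for illustration, since the real settings live in the workflow's Nextflow processes and `scripts/map_qc.py`.

```python
# Minimal sketch of steps 3-4 above; file names and flags are
# illustrative assumptions, not the pipeline's actual settings.
import subprocess

def run(cmd):
    """Run a shell pipeline and raise if any stage fails."""
    subprocess.run(cmd, shell=True, check=True)

# Step 3: map with BWA, filter/convert with SamTools, sort with Sambamba
run("bwa mem -t 4 genome.fa sample_trimmed.fq.gz | "
    "samtools view -b -q 30 - > sample.bam")
run("sambamba sort -t 4 -o sample.sorted.bam sample.bam")

# Step 4: mark duplicates with Sambamba, drop them with SamTools,
# then index the deduplicated bam
run("sambamba markdup -t 4 sample.sorted.bam sample.dup_marked.bam")
run("samtools view -b -F 1024 sample.dup_marked.bam > sample.dedup.bam")
run("samtools index sample.dedup.bam")
```
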
## Output Files
@@ -55,6 +56,14 @@ Folder | File | Description
design | N/A | Inputs used for analysis; can ignore
trimReads | *_trimming_report.txt | report detailing how many reads were trimmed
trimReads | *_trimmed.fq.gz | trimmed fastq files used for analysis
alignReads | *.flagstat.qc | QC metrics from the mapping process
alignReads | *.bam | sorted bam file
filterReads | *.dedup.qc | QC metrics from marking duplicate reads (sambamba)
filterReads | *.dedup.bam | filtered bam file with duplicate reads removed
filterReads | *.dedup.bam.bai | indexed filtered bam file
filterReads | *.dedup.flagstat.qc | QC metrics of filtered bam file (mapping stats, samtools)
filterReads | *.dedup.pbc.qc | QC metrics of library complexity
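
Each `*.pbc.qc` report is a single-row, tab-separated table, so it can be inspected directly with pandas. A small sketch, using a hypothetical sample directory under `output/filterReads/`:

```python
# Sketch of reading a library-complexity report; the SAMPLE path is a
# hypothetical placeholder, not a file shipped with the workflow.
import pandas as pd

pbc = pd.read_csv("output/filterReads/SAMPLE/SAMPLE.pbc.qc", sep="\t")
print(pbc[["Sample", "NRF", "PBC1", "PBC2"]].to_string(index=False))
```
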
## Common Errors
@@ -156,7 +156,7 @@ process alignReads {
output:
set sampleId, file('*.bam'), experimentId, replicate into mappedReads
file '*.flagstat.qc' into mappedReadsStats
file('version_*.txt') into alignReadsVersions
script:
if (params.astrocyte == true) {
@@ -197,34 +197,54 @@ process alignReads {
// Dedup reads using sambamba
process filterReads {
tag "$sampleId-$replicate"
publishDir "$baseDir/output/${task.process}", mode: 'copy'
tag "${sampleId}-${replicate}"
publishDir "${outDir}/${task.process}/${sampleId}", mode: 'copy'
input:
set sampleId, mapped, experimentId, replicate from mappedReads
output:
set sampleId, file('*.bam'), file('*.bai'), experimentId, replicate into dedupReads
set sampleId, file('*.bam'), file('*.bai'), experimentId, replicate into tssEnrich
set sampleId, file('*.bam'), file('*.bai'), experimentId, replicate into convertReads
file '*flagstat.qc' into dedupReadsStats
file '*pbc.qc' into dedupReadsComplexity
file '*dup.qc' into dupReads
file('version_*.txt') into filterReadsVersions
script:
-    if (pairedEnd) {
-      """
-      python3 $baseDir/scripts/map_qc.py -b $mapped -p
-      """
-    }
-    else {
-      """
-      python3 $baseDir/scripts/map_qc.py -b $mapped
-      """
-    }
+    if (params.astrocyte == true) {
+      if (pairedEnd) {
+        """
+        module load python/3.6.1-2-anaconda
+        module load samtools/1.4.1
+        module load sambamba/0.6.6
+        module load bedtools/2.26.0
+        python3 ${baseDir}/scripts/map_qc.py -b ${mapped} -p
+        """
+      }
+      else {
+        """
+        module load python/3.6.1-2-anaconda
+        module load samtools/1.4.1
+        module load sambamba/0.6.6
+        module load bedtools/2.26.0
+        python3 ${baseDir}/scripts/map_qc.py -b ${mapped}
+        """
+      }
+    }
+    else {
+      if (pairedEnd) {
+        """
+        python3 ${baseDir}/scripts/map_qc.py -b ${mapped} -p
+        """
+      }
+      else {
+        """
+        python3 ${baseDir}/scripts/map_qc.py -b ${mapped}
+        """
+      }
+    }
}
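
The process calls `map_qc.py` with `-b` for the sorted bam and `-p` when the data are paired-end. The diff does not show how the script defines those options; the following argparse sketch is a hypothetical reconstruction of that interface, not the script's actual code.

```python
# Hypothetical sketch of the -b / -p command-line interface used above;
# the committed map_qc.py may define these options differently.
import argparse

def get_args():
    parser = argparse.ArgumentParser(
        description="Mark duplicates, filter the bam, and compute library complexity.")
    parser.add_argument('-b', '--bam', required=True,
                        help="Sorted bam file to filter.")
    parser.add_argument('-p', '--paired', action='store_true',
                        help="Treat reads as paired-end.")
    return parser.parse_args()

if __name__ == '__main__':
    args = get_args()
    print("bam: %s, paired-end: %s" % (args.bam, args.paired))
```
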
@@ -12,7 +12,6 @@ from multiprocessing import cpu_count
import pandas as pd
import utils
EPILOG = '''
For more details:
%(prog)s --help
@@ -63,6 +62,15 @@ def check_tools():
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
# Get Version
samtools_version_command = "samtools --version"
samtools_version = subprocess.check_output(samtools_version_command, shell=True)
# Write to file
samtools_file = open("version_samtools.txt", "wb")
samtools_file.write(samtools_version)
samtools_file.close()
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
@@ -70,6 +78,18 @@ def check_tools():
sambamba_path = shutil.which("sambamba")
if sambamba_path:
logger.info('Found sambamba: %s', sambamba_path)
# Get Version
sambamba_version_command = "sambamba"
try:
subprocess.check_output(sambamba_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
sambamba_version = e.output
# Write to file
sambamba_file = open("version_sambamba.txt", "wb")
sambamba_file.write(sambamba_version)
sambamba_file.close()
else:
logger.error('Missing sambamba')
raise Exception('Missing sambamba')
@@ -77,6 +97,15 @@ def check_tools():
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
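
The three version checks added above repeat one capture-and-write pattern; sambamba is the odd one out only because it exits non-zero when run without arguments, so its banner arrives on the raised `CalledProcessError`. A sketch of how the pattern could be factored into a shared helper (illustrative only, not part of the committed script):

```python
# Illustrative helper consolidating the version checks above; not part
# of the committed map_qc.py.
import subprocess

def write_tool_version(tool, version_command):
    """Capture a tool's version banner and write it to version_<tool>.txt."""
    try:
        output = subprocess.check_output(version_command, shell=True,
                                         stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        # sambamba prints its banner and exits non-zero when run bare,
        # so the useful text is attached to the exception.
        output = e.output
    with open("version_%s.txt" % tool, "wb") as version_file:
        version_file.write(output)

write_tool_version("samtools", "samtools --version")
write_tool_version("sambamba", "sambamba")
write_tool_version("bedtools", "bedtools --version")
```
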
@@ -168,9 +197,8 @@ def dedup_mapped(bam, bam_basename, paired):
shlex.split(sambamba_markdup_command),
stderr=dup_qc)
# Remove duplicates
-final_bam_prefix = bam_basename + ".filt.nodup"
+final_bam_prefix = bam_basename + ".dedup"
final_bam_filename = final_bam_prefix + ".bam"
if paired: # paired-end data
@@ -207,7 +235,7 @@ def dedup_mapped(bam, bam_basename, paired):
def compute_complexity(bam, paired, bam_basename):
'''Calculate library complexity.'''
-pbc_file_qc_filename = bam_basename + ".filt.nodup.pbc.qc"
+pbc_file_qc_filename = bam_basename + ".pbc.qc"
tmp_pbc_file_qc_filename = "tmp.%s" % (pbc_file_qc_filename)
# Sort by name
@@ -230,17 +258,20 @@ def compute_complexity(bam, paired, bam_basename):
'TwoReadPairs',
'NRF',
'PBC1',
-'PBC2']
+'PBC2',
+]
if paired:
steps = [
"samtools sort -@%d -n %s" % (cpu_count(), bam),
"bamToBed -bedpe -i stdin",
r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""]
r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'""",
]
else:
steps = [
"bamToBed -i %s" % (bam),
r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""]
r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'""",
]
steps.extend([
"grep -v 'chrM'",
"sort",
@@ -251,9 +282,13 @@ def compute_complexity(bam, paired, bam_basename):
if err:
logger.error("PBC file error: %s", err)
-# Add headers
+# Add Sample Name and headers
pbc_file = pd.read_csv(tmp_pbc_file_qc_filename, sep='\t', header=None,
names=pbc_headers)
pbc_file['Sample'] = bam_basename
pbc_headers_new = list(pbc_file)
pbc_headers_new.insert(0, pbc_headers_new.pop(pbc_headers_new.index('Sample')))
pbc_file = pbc_file[pbc_headers_new]
pbc_file.to_csv(pbc_file_qc_filename, header=True, sep='\t', index=False)
os.remove(bam)
os.remove(bam + '.bai')
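
The NRF/PBC1/PBC2 columns written above follow the usual ENCODE-style library-complexity definitions: NRF is distinct genomic locations over total read pairs, PBC1 is locations seen exactly once over distinct locations, and PBC2 is locations seen once over locations seen twice. A small sketch of those ratios with made-up counts (the committed script derives the actual counts from the bamToBed/sort/uniq pipeline above):

```python
# Sketch of the ratios behind NRF, PBC1 and PBC2, using made-up counts;
# the committed script computes the counts with its awk/sort/uniq steps.
def complexity_metrics(total_pairs, distinct_pairs, one_read_pairs, two_read_pairs):
    nrf = distinct_pairs / total_pairs       # non-redundant fraction
    pbc1 = one_read_pairs / distinct_pairs   # PCR bottlenecking coefficient 1
    pbc2 = one_read_pairs / two_read_pairs   # PCR bottlenecking coefficient 2
    return nrf, pbc1, pbc2

print(complexity_metrics(total_pairs=1000000, distinct_pairs=900000,
                         one_read_pairs=850000, two_read_pairs=40000))
```
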
@@ -8,23 +8,45 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/filterReads/'
-@pytest.mark.integration
-def test_map_qc_singleend():
-    #assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam'))
-    #assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam.bai'))
-    #filtered_reads_report = test_output_path + 'ENCFF646LXU.filt.nodup.flagstat.qc'
-    #samtools_report = open(filtered_reads_report).readlines()
-    #assert '64962570 + 0 in total' in samtools_report[0]
-    #assert '64962570 + 0 mapped (100.00%:N/A)' in samtools_report[4]
-    #library_complexity = test_output_path + 'ENCFF646LXU.filt.nodup.pbc.qc'
-    #df_library_complexity = pd.read_csv(library_complexity, sep='\t')
-    #assert df_library_complexity["NRF"].iloc[0] == 0.926192
-    #assert df_library_complexity["PBC1"].iloc[0] == 0.926775
-    #assert df_library_complexity["PBC2"].iloc[0] == 13.706885
-    pass
-
-@pytest.mark.integration
-def test_map_qc_pairedend():
-    # Do the same thing for paired end data
-    pass
+@pytest.mark.singleend_human
+def test_filter_reads_singleend_human():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX/ENCLB969KTX.dedup.bam'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX/ENCLB969KTX.dedup.bam.bai'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX/ENCLB969KTX.dedup.qc'))
+
+@pytest.mark.singleend_human
+def test_filter_reads_flagstat_singleend_human():
+    filtered_reads_report = test_output_path + 'ENCLB969KTX/ENCLB969KTX.dedup.flagstat.qc'
+    samtools_report = open(filtered_reads_report).readlines()
+    assert '28118054 + 0 in total' in samtools_report[0]
+    assert '28118054 + 0 mapped (100.00%:N/A)' in samtools_report[4]
+
+@pytest.mark.singleend_human
+def test_library_complexity_singleend_human():
+    library_complexity = test_output_path + 'ENCLB969KTX/ENCLB969KTX.pbc.qc'
+    df_library_complexity = pd.read_csv(library_complexity, sep='\t')
+    assert df_library_complexity["NRF"].iloc[0] == 0.608847
+    assert df_library_complexity["PBC1"].iloc[0] == 0.641176
+    assert df_library_complexity["PBC2"].iloc[0] == 2.887092
+
+@pytest.mark.pairedend_mouse
+def test_filter_reads_pairedend_mouse():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP/ENCLB122XDP.dedup.bam'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP/ENCLB122XDP.dedup.bam.bai'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP/ENCLB122XDP.dedup.qc'))
+
+@pytest.mark.pairedend_mouse
+def test_filter_reads_flagstat_pairedend_mouse():
+    filtered_reads_report = test_output_path + 'ENCLB122XDP/ENCLB122XDP.dedup.flagstat.qc'
+    samtools_report = open(filtered_reads_report).readlines()
+    assert '34342124 + 0 in total' in samtools_report[0]
+    assert '34342124 + 0 mapped (100.00%:N/A)' in samtools_report[4]
+
+@pytest.mark.pairedend_mouse
+def test_library_complexity_pairedend_mouse():
+    library_complexity = test_output_path + 'ENCLB122XDP/ENCLB122XDP.pbc.qc'
+    df_library_complexity = pd.read_csv(library_complexity, sep='\t')
+    assert df_library_complexity["NRF"].iloc[0] == 0.912926
+    assert round(df_library_complexity["PBC1"].iloc[0], 6) == 0.913898
+    assert round(df_library_complexity["PBC2"].iloc[0], 6) == 11.654689