Skip to content
Snippets Groups Projects

fix filter reads

Merged Holly Ruess requested to merge 15-FixFilterReads into master
Compare and
4 files
+ 139
53
Preferences
File browser
Compare changes
+ 43
8
@@ -12,7 +12,6 @@ from multiprocessing import cpu_count
@@ -12,7 +12,6 @@ from multiprocessing import cpu_count
import pandas as pd
import pandas as pd
import utils
import utils
EPILOG = '''
EPILOG = '''
For more details:
For more details:
%(prog)s --help
%(prog)s --help
@@ -63,6 +62,15 @@ def check_tools():
@@ -63,6 +62,15 @@ def check_tools():
samtools_path = shutil.which("samtools")
samtools_path = shutil.which("samtools")
if samtools_path:
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
logger.info('Found samtools: %s', samtools_path)
 
 
# Get Version
 
samtools_version_command = "samtools --version"
 
samtools_version = subprocess.check_output(samtools_version_command, shell=True)
 
 
# Write to file
 
samtools_file = open("version_samtools.txt", "wb")
 
samtools_file.write(samtools_version)
 
samtools_file.close()
else:
else:
logger.error('Missing samtools')
logger.error('Missing samtools')
raise Exception('Missing samtools')
raise Exception('Missing samtools')
@@ -70,6 +78,18 @@ def check_tools():
@@ -70,6 +78,18 @@ def check_tools():
sambamba_path = shutil.which("sambamba")
sambamba_path = shutil.which("sambamba")
if sambamba_path:
if sambamba_path:
logger.info('Found sambamba: %s', sambamba_path)
logger.info('Found sambamba: %s', sambamba_path)
 
 
# Get Version
 
sambamba_version_command = "sambamba"
 
try:
 
subprocess.check_output(sambamba_version_command, shell=True, stderr=subprocess.STDOUT)
 
except subprocess.CalledProcessError as e:
 
sambamba_version = e.output
 
 
# Write to file
 
sambamba_file = open("version_sambamba.txt", "wb")
 
sambamba_file.write(sambamba_version)
 
sambamba_file.close()
else:
else:
logger.error('Missing sambamba')
logger.error('Missing sambamba')
raise Exception('Missing sambamba')
raise Exception('Missing sambamba')
@@ -77,6 +97,15 @@ def check_tools():
@@ -77,6 +97,15 @@ def check_tools():
bedtools_path = shutil.which("bedtools")
bedtools_path = shutil.which("bedtools")
if bedtools_path:
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
logger.info('Found bedtools: %s', bedtools_path)
 
 
# Get Version
 
bedtools_version_command = "bedtools --version"
 
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
 
 
# Write to file
 
bedtools_file = open("version_bedtools.txt", "wb")
 
bedtools_file.write(bedtools_version)
 
bedtools_file.close()
else:
else:
logger.error('Missing bedtools')
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
raise Exception('Missing bedtools')
@@ -168,9 +197,8 @@ def dedup_mapped(bam, bam_basename, paired):
@@ -168,9 +197,8 @@ def dedup_mapped(bam, bam_basename, paired):
shlex.split(sambamba_markdup_command),
shlex.split(sambamba_markdup_command),
stderr=dup_qc)
stderr=dup_qc)
# Remove duplicates
# Remove duplicates
final_bam_prefix = bam_basename + ".filt.nodup"
final_bam_prefix = bam_basename + ".dedup"
final_bam_filename = final_bam_prefix + ".bam"
final_bam_filename = final_bam_prefix + ".bam"
if paired: # paired-end data
if paired: # paired-end data
@@ -207,7 +235,7 @@ def dedup_mapped(bam, bam_basename, paired):
@@ -207,7 +235,7 @@ def dedup_mapped(bam, bam_basename, paired):
def compute_complexity(bam, paired, bam_basename):
def compute_complexity(bam, paired, bam_basename):
'''Calculate library complexity .'''
'''Calculate library complexity .'''
pbc_file_qc_filename = bam_basename + ".filt.nodup.pbc.qc"
pbc_file_qc_filename = bam_basename + ".pbc.qc"
tmp_pbc_file_qc_filename = "tmp.%s" % (pbc_file_qc_filename)
tmp_pbc_file_qc_filename = "tmp.%s" % (pbc_file_qc_filename)
# Sort by name
# Sort by name
@@ -230,17 +258,20 @@ def compute_complexity(bam, paired, bam_basename):
@@ -230,17 +258,20 @@ def compute_complexity(bam, paired, bam_basename):
'TwoReadPairs',
'TwoReadPairs',
'NRF',
'NRF',
'PBC1',
'PBC1',
'PBC2']
'PBC2',
 
]
if paired:
if paired:
steps = [
steps = [
"samtools sort -@%d -n %s" % (cpu_count(), bam),
"samtools sort -@%d -n %s" % (cpu_count(), bam),
"bamToBed -bedpe -i stdin",
"bamToBed -bedpe -i stdin",
r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""]
r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'""",
 
]
else:
else:
steps = [
steps = [
"bamToBed -i %s" % (bam),
"bamToBed -i %s" % (bam),
r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""]
r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'""",
 
]
steps.extend([
steps.extend([
"grep -v 'chrM'",
"grep -v 'chrM'",
"sort",
"sort",
@@ -251,9 +282,13 @@ def compute_complexity(bam, paired, bam_basename):
@@ -251,9 +282,13 @@ def compute_complexity(bam, paired, bam_basename):
if err:
if err:
logger.error("PBC file error: %s", err)
logger.error("PBC file error: %s", err)
# Add headers
# Add Sample Name and headers
pbc_file = pd.read_csv(tmp_pbc_file_qc_filename, sep='\t', header=None,
pbc_file = pd.read_csv(tmp_pbc_file_qc_filename, sep='\t', header=None,
names=pbc_headers)
names=pbc_headers)
 
pbc_file['Sample'] = bam_basename
 
pbc_headers_new = list(pbc_file)
 
pbc_headers_new.insert(0, pbc_headers_new.pop(pbc_headers_new.index('Sample')))
 
pbc_file = pbc_file[pbc_headers_new]
pbc_file.to_csv(pbc_file_qc_filename, header=True, sep='\t', index=False)
pbc_file.to_csv(pbc_file_qc_filename, header=True, sep='\t', index=False)
os.remove(bam)
os.remove(bam)
os.remove(bam + '.bai')
os.remove(bam + '.bai')