Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
chipseq_analysis
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Iterations
Requirements
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package Registry
Container Registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Astrocyte
Workflows
BICF
chipseq_analysis
Commits
3a9a868b
Commit
3a9a868b
authored
7 years ago
by
Venkat Malladi
Browse files
Options
Downloads
Patches
Plain Diff
Add in steps to remove dups and filter reads.
parent
4357eb15
Branches
Branches containing commit
Tags
Tags containing commit
No related merge requests found
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
astrocyte_pkg.yml
+3
-0
3 additions, 0 deletions
astrocyte_pkg.yml
workflow/conf/biohpc.config
+4
-0
4 additions, 0 deletions
workflow/conf/biohpc.config
workflow/main.nf
+32
-1
32 additions, 1 deletion
workflow/main.nf
workflow/scripts/map_qc.py
+274
-0
274 additions, 0 deletions
workflow/scripts/map_qc.py
with
313 additions
and
1 deletion
astrocyte_pkg.yml
+
3
−
0
View file @
3a9a868b
...
...
@@ -44,6 +44,9 @@ workflow_modules:
-
'
cutadapt/1.9.1'
-
'
fastqc/0.11.2'
-
'
bwa/intel/0.7.12'
-
'
samtools/intel/1.3'
-
'
sambamba/0.6.6'
-
'
bedtools/2.26.0'
# A list of parameters used by the workflow, defining how to present them,
# options etc in the web interface. For each parameter:
...
...
This diff is collapsed.
Click to expand it.
workflow/conf/biohpc.config
+
4
−
0
View file @
3a9a868b
...
...
@@ -15,6 +15,10 @@ process {
module
= [
'python/3.6.1-2-anaconda'
,
'bwa/intel/0.7.12'
,
'samtools/intel/1.3'
]
cpus
=
32
}
$
filterReads
{
module
= [
'python/3.6.1-2-anaconda'
,
'samtools/intel/1.3'
,
'sambamba/0.6.6'
,
'bedtools/2.26.0'
]
cpus
=
32
}
}
params
{
...
...
This diff is collapsed.
Click to expand it.
workflow/main.nf
+
32
−
1
View file @
3a9a868b
...
...
@@ -114,7 +114,7 @@ process alignReads {
output:
set sampleId, file('*.bam'), biosample, factor, treatment, replicate, controlId into mappedReads
set
file
(
'*.srt.bam.flagstat.qc'
)
file
'*.srt.bam.flagstat.qc'
into mappedReadsStats
script:
...
...
@@ -130,3 +130,34 @@ process alignReads {
}
}
// Dedup reads using sambamba and filter with samtools (see scripts/map_qc.py)
process filterReads {

  tag "$sampleId-$replicate"
  publishDir "$baseDir/output/${task.process}", mode: 'copy'

  input:

  set sampleId, mapped, biosample, factor, treatment, replicate, controlId from mappedReads

  output:

  set sampleId, file('*.bam'), biosample, factor, treatment, replicate, controlId into dedupReads
  set file('*flagstat.qc') into dedupReadsStats
  set file('*pbc.qc') into dedupReadsComplexity

  script:

  // map_qc.py declares the bam argument as -b/--bam (not -f); -p flags paired-end
  if (pairedEnd) {
    """
    python3 $baseDir/scripts/map_qc.py -b $mapped -p
    """
  }
  else {
    """
    python3 $baseDir/scripts/map_qc.py -b $mapped
    """
  }

}
This diff is collapsed.
Click to expand it.
workflow/scripts/map_qc.py
0 → 100644
+
274
−
0
View file @
3a9a868b
#!/usr/bin/env python3

'''
Remove duplicates and filter unmapped reads.
'''

import os
import subprocess
import argparse
import shutil
import shlex
import logging
from multiprocessing import cpu_count
import json
import utils

# NOTE(review): functions below call common.run_pipe, but no `common` module is
# imported here (only `utils`) — confirm which module provides run_pipe.

# Help epilog text rendered verbatim by argparse (RawDescriptionHelpFormatter).
EPILOG = '''
For more details:
        %(prog)s --help
'''

# SETTINGS

# Module-level logger: silent by default (NullHandler); main() attaches a
# FileHandler so log output goes to filter_qc.log when run as a script.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)

# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.bam', '.srt']
def get_args():
    '''Define and parse command-line arguments.

    Returns:
        argparse.Namespace with:
            bam (str): path to the BAM file to filter and QC (required).
            paired (bool): True for paired-end data, False (default) for
                single-end.
    '''
    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # '-f' is accepted as an alias because the Nextflow pipeline invokes this
    # script as `map_qc.py -f <bam>`; '-b/--bam' remains the documented form.
    parser.add_argument('-b', '-f', '--bam',
                        dest='bam',
                        help="The bam file to run filtering and qc on.",
                        required=True)

    parser.add_argument('-p', '--paired',
                        help="True/False if paired-end or single end.",
                        default=False,
                        action='store_true')

    args = parser.parse_args()
    return args
# Functions
def check_tools():
    '''Check for required components on the user's system.

    Verifies that samtools, sambamba and bedtools are on PATH.

    Raises:
        Exception: if any of the required executables is missing.
    '''
    logger.info('Checking for required libraries and components on this system')

    samtools_path = shutil.which("samtools")
    if samtools_path:
        logger.info('Found samtools: %s', samtools_path)
    else:
        logger.error('Missing samtools')
        raise Exception('Missing samtools')

    # Original code tested (and logged) the undefined name `bwa_path` here,
    # which would raise NameError; check the variable actually assigned.
    sambamba_path = shutil.which("sambamba")
    if sambamba_path:
        logger.info('Found sambamba: %s', sambamba_path)
    else:
        logger.error('Missing sambamba')
        raise Exception('Missing sambamba')

    bedtools_path = shutil.which("bedtools")
    if bedtools_path:
        logger.info('Found bedtools: %s', bedtools_path)
    else:
        logger.error('Missing bedtools')
        raise Exception('Missing bedtools')
def filter_mapped_pe(bam, bam_basename):
    '''Use samtools to filter unmapped reads for PE data.

    Args:
        bam: path to the coordinate-sorted input BAM.
        bam_basename: basename (extensions stripped) used to derive outputs.

    Returns:
        Path to the filtered, coordinate-sorted BAM
        ("<basename>.filt.srt.bam").
    '''
    filt_bam_prefix = bam_basename + ".filt.srt"
    filt_bam_filename = filt_bam_prefix + ".bam"
    tmp_filt_bam_prefix = "tmp.%s" % (filt_bam_prefix)
    tmp_filt_bam_filename = tmp_filt_bam_prefix + ".bam"

    # Remove unmapped, mate unmapped
    # not primary alignment, reads failing platform
    # Remove low MAPQ reads
    # Only keep properly paired reads
    # Obtain name sorted BAM file
    # NOTE(review): `common` is not imported in this module — confirm which
    # module provides run_pipe (possibly utils).
    out, err = common.run_pipe([
        # filter: -F 1804 FLAG bits to exclude; -f 2 FLAG bits to require;
        # -q 30 exclude MAPQ < 30; -u uncompressed output
        # exclude FLAG 1804: unmapped, next segment unmapped, secondary
        # alignments, not passing platform q, PCR or optical duplicates
        # require FLAG 2: properly aligned
        "samtools view -F 1804 -f 2 -q 30 -u %s" % (bam),
        # sort: -n sort by name; - take input from stdin;
        # out to specified filename
        # Will produce name sorted BAM
        "samtools sort -n -@%d -o %s" % (cpu_count(), tmp_filt_bam_filename)])
    if err:
        logger.error("samtools filter error: %s" % (err))

    # Remove orphan reads (pair was removed)
    # and read pairs mapping to different chromosomes
    # Obtain position sorted BAM
    out, err = common.run_pipe([
        # fill in mate coordinates, ISIZE and mate-related flags
        # fixmate requires name-sorted alignment; -r removes secondary and
        # unmapped (redundant here because already done above?)
        # - send output to stdout
        "samtools fixmate -r %s -" % (tmp_filt_bam_filename),
        # repeat filtering after mate repair
        "samtools view -F 1804 -f 2 -u -",
        # produce the coordinate-sorted BAM
        "samtools sort -@%d -o %s" % (cpu_count(), filt_bam_filename)])
    # The original checked err only for the first pipeline; check the
    # fixmate/re-filter pipeline too, for consistency.
    if err:
        logger.error("samtools fixmate error: %s" % (err))

    os.remove(tmp_filt_bam_filename)
    return filt_bam_filename
def filter_mapped_se(bam, bam_basename):
    '''Use samtools to filter unmapped reads for SE data.

    Args:
        bam: path to the coordinate-sorted input BAM.
        bam_basename: basename (extensions stripped) used to derive outputs.

    Returns:
        Path to the filtered BAM ("<basename>.filt.srt.bam").
    '''
    filt_bam_prefix = bam_basename + ".filt.srt"
    filt_bam_filename = filt_bam_prefix + ".bam"

    # Remove unmapped, mate unmapped
    # not primary alignment, reads failing platform
    # Remove low MAPQ reads
    # Obtain name sorted BAM file
    with open(filt_bam_filename, 'w') as fh:
        # Original interpolated two undefined names (samtools_params,
        # input_bam) into a single %s, which raises at runtime; the only
        # argument needed is the input BAM path.
        samtools_filter_command = (
            "samtools view -F 1804 -q 30 -b %s" % (bam))
        logger.info(samtools_filter_command)
        subprocess.check_call(
            shlex.split(samtools_filter_command),
            stdout=fh)

    return filt_bam_filename
def dedup_mapped(bam, bam_basename, paired):
    '''Use sambamba and samtools to remove duplicates.

    Marks duplicates with sambamba, writes the markdup report, removes
    duplicate-flagged reads with samtools, indexes the result and records
    flagstat mapping statistics.

    Args:
        bam: path to the filtered BAM produced by filter_mapped_pe/se
            (consumed and deleted on success).
        bam_basename: basename used to derive output filenames.
        paired: True for paired-end data (adds the -f 2 properly-paired
            requirement when dropping duplicates).

    Returns:
        Path to the deduplicated BAM ("<basename>.filt.nodup.bam").
    '''
    # Markduplicates
    dup_file_qc_filename = bam_basename + ".dup.qc"
    tmp_dup_mark_filename = bam_basename + ".dupmark.bam"
    sambamba_parms = ("--hash-table-size=17592186044416 "
                      "--overflow-list-size=20000000 --io-buffer-size=256")
    with open(dup_file_qc_filename, 'w') as fh:
        # cpu_count must be *called*; the original passed the function object
        # to %d, a TypeError.
        sambamba_markdup_command = (
            "sambamba markdup -t %d %s %s %s"
            % (cpu_count(), sambamba_parms, bam, tmp_dup_mark_filename))
        logger.info(sambamba_markdup_command)
        # Original split the undefined name `sambamba_markdup`.
        subprocess.check_call(
            shlex.split(sambamba_markdup_command),
            stdout=fh)
    os.remove(tmp_dup_mark_filename + '.bai')

    # Remove duplicates
    final_bam_prefix = bam_basename + ".filt.nodup"
    final_bam_filename = final_bam_prefix + ".bam"
    # Filter the duplicate-*marked* BAM (the 1804 mask includes the PCR/optical
    # duplicate flag, which only exists after markdup); the original referenced
    # the undefined name `filt_bam_filename` here.
    if paired:
        # paired-end data
        samtools_dedupe_command = \
            "samtools view -F 1804 -f 2 -b %s" % (tmp_dup_mark_filename)
    else:
        samtools_dedupe_command = \
            "samtools view -F 1804 -b %s" % (tmp_dup_mark_filename)

    with open(final_bam_filename, 'w') as fh:
        logger.info(samtools_dedupe_command)
        subprocess.check_call(
            shlex.split(samtools_dedupe_command),
            stdout=fh)

    # Index final bam file
    # The variable name says sambamba, and `-t <threads>` is a sambamba flag
    # (samtools 1.3 index has no -t); run sambamba index as intended.
    sambamba_index_command = \
        "sambamba index -t %d %s" % (cpu_count(), final_bam_filename)
    logger.info(sambamba_index_command)
    subprocess.check_output(shlex.split(sambamba_index_command))

    # Generate mapping statistics
    final_bam_file_mapstats_filename = final_bam_prefix + ".flagstat.qc"
    with open(final_bam_file_mapstats_filename, 'w') as fh:
        flagstat_command = (
            "sambamba flagstat -t %d %s"
            % (cpu_count(), final_bam_filename))
        logger.info(flagstat_command)
        subprocess.check_call(shlex.split(flagstat_command), stdout=fh)

    # Drop intermediates: the filtered input BAM and the dup-marked BAM
    # (leaving the latter would also make the pipeline's '*.bam' output glob
    # ambiguous). The original removed the undefined `filt_bam_filename`.
    os.remove(bam)
    os.remove(tmp_dup_mark_filename)
    return final_bam_filename
def compute_complexity(bam, paired, bam_basename):
    '''Calculate library complexity (PBC/NRF metrics).

    Converts the BAM to fragment coordinates, counts distinct positions and
    writes a one-line QC file.

    Args:
        bam: path to the BAM to assess.
        paired: True for paired-end data (uses bedpe fragment coordinates).
        bam_basename: basename used to derive the QC filename.

    NOTE(review): the caller currently passes the *deduplicated* BAM; PBC
    metrics are conventionally computed on the duplicate-marked (pre-dedup)
    BAM, otherwise NRF/PBC1 are trivially ~1 — confirm intent.
    '''
    # Original concatenated without a separator, producing
    # "<basename>filt.nodup.pbc.qc"; add the missing dot.
    pbc_file_qc_filename = bam_basename + ".filt.nodup.pbc.qc"

    # Sort by name
    # convert to bedPE and obtain fragment coordinates
    # sort by position and strand
    # Obtain unique count statistics

    # PBC File output
    # TotalReadPairs [tab]
    # DistinctReadPairs [tab]
    # OneReadPair [tab]
    # TwoReadPairs [tab]
    # NRF=Distinct/Total [tab]
    # PBC1=OnePair/Distinct [tab]
    # PBC2=OnePair/TwoPair

    if paired:
        # NOTE(review): sambamba sort writes to a file unless told to stream
        # (-o /dev/stdout); confirm this feeds the pipe as intended.
        steps = [
            "sambamba sort -t %d -n %s" % (cpu_count(), bam),
            "bamToBed -bedpe -i stdin",
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""]
    else:
        steps = [
            "bamToBed -i %s" % (bam),
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""]
    steps.extend([
        "grep -v 'chrM'",
        "sort",
        "uniq -c",
        r"""awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{printf "%d\t%d\t%d\t%d\t%f\t%f\t%f\n",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}'"""])
    # NOTE(review): `common` is not imported in this module — confirm which
    # module provides run_pipe (possibly utils).
    out, err = common.run_pipe(steps, pbc_file_qc_filename)
    if err:
        logger.error("PBC file error: %s" % (err))
def main():
    '''Entry point: filter, deduplicate and QC the BAM named on the CLI.'''
    args = get_args()
    paired = args.paired
    bam = args.bam

    # Log to a file when run as a script.
    logger.addHandler(logging.FileHandler('filter_qc.log'))

    # Fail fast if samtools/sambamba/bedtools are not on PATH.
    check_tools()

    # Basename with .bam/.srt extensions stripped drives all output names.
    stripped = utils.strip_extensions(bam, STRIP_EXTENSIONS)
    bam_basename = os.path.basename(stripped)

    # Filter unmapped/low-quality reads (PE keeps only proper pairs).
    if paired:
        filter_bam_filename = filter_mapped_pe(bam, bam_basename)
    else:
        filter_bam_filename = filter_mapped_se(bam, bam_basename)

    # Remove duplicates.
    dedup_bam_filename = dedup_mapped(filter_bam_filename, bam_basename, paired)

    # Compute library complexity.
    compute_complexity(dedup_bam_filename, paired, bam_basename)


if __name__ == '__main__':
    main()
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment