BICF / Astrocyte / atacseq_analysis

Commit a14bd619, authored 5 years ago by Venkat Malladi
Merge branch '15-FixFilterReads' into 'master'

fix filter reads

Closes #15

See merge request !9

Parents: 2074cdc8, 4cadae4e

1 merge request: !9 fix filter reads

Pipeline #5289 passed with stages in 9 hours, 44 minutes, and 4 seconds
Showing 4 changed files with 139 additions and 53 deletions:

README.md (+11 −2)
workflow/main.nf (+43 −23)
workflow/scripts/map_qc.py (+43 −8)
workflow/tests/test_map_qc.py (+42 −20)
README.md (+11 −2)  View file @ a14bd619
# Astrocyte ATAC-seq analysis Workflow Package
[Badges: four project-status badges linking to https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/commits/master, a Nextflow badge linking to https://www.nextflow.io/, and an Astrocyte documentation badge linking to https://astrocyte-test.biohpc.swmed.edu/static/docs/index.html; badge images are not rendered in this view.]
...
...
@@ -47,6 +47,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
 1. Check input files
 2. Trim adaptors with TrimGalore!
 3. Map reads with BWA, filter with SamTools, and sort with Sambamba
+4. Mark duplicates with Sambamba, Filter reads with SamTools, and calculate library complexity with SamTools and bedtools
## Output Files
...
...
@@ -55,6 +56,14 @@ Folder | File | Description
 design | N/A | Inputs used for analysis; can ignore
 trimReads | *_trimming_report.txt | report detailing how many reads were trimmed
 trimReads | *_trimmed.fq.gz | trimmed fastq files used for analysis
 alignReads | *.flagstat.qc | QC metrics from the mapping process
 alignReads | *.bam | sorted bam file
+filterReads | *.dedup.qc | QC metrics of find duplicate reads (sambamba)
+filterReads | *.dedup.bam | filtered bam file with duplicate reads removed
+filterReads | *.dedup.bam.bai | indexed filtered bam file
+filterReads | *.dedup.flagstat.qc | QC metrics of filtered bam file (mapping stats, samtools)
+filterReads | *.dedup.pbc.qc | QC metrics of library complexity
## Common Errors
...
...
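The filterReads rows added to the output-files table above describe the deduplicated BAM, its index, and the flagstat and library-complexity QC reports. As a quick, illustrative example of consuming the complexity report with pandas (the sample name is borrowed from the tests later in this commit, and the exact output path may differ on a given run):

# Illustrative only: load the library-complexity QC table written by filterReads.
import pandas as pd

sample = 'ENCLB969KTX'  # sample name borrowed from the updated tests
pbc = pd.read_csv('output/filterReads/%s/%s.pbc.qc' % (sample, sample), sep='\t')
print(pbc[['NRF', 'PBC1', 'PBC2']].iloc[0])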
workflow/main.nf (+43 −23)  View file @ a14bd619
...
...
@@ -156,7 +156,7 @@ process alignReads {
   output:
   set sampleId, file('*.bam'), experimentId, replicate into mappedReads
   file '*.flagstat.qc' into mappedReadsStats
   file('version_*.txt') into alignReadsVersions
   script:
   if (params.astrocyte == true) {
...
...
@@ -197,34 +197,54 @@ process alignReads {
 // Dedup reads using sambamba
 process filterReads {
-  tag "$sampleId-$replicate"
-  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  tag "${sampleId}-${replicate}"
+  publishDir "${outDir}/${task.process}/${sampleId}", mode: 'copy'

   input:

   set sampleId, mapped, experimentId, replicate from mappedReads

   output:

   set sampleId, file('*.bam'), file('*.bai'), experimentId, replicate into dedupReads
   set sampleId, file('*.bam'), file('*.bai'), experimentId, replicate into tssEnrich
   set sampleId, file('*.bam'), file('*.bai'), experimentId, replicate into convertReads
   file '*flagstat.qc' into dedupReadsStats
   file '*pbc.qc' into dedupReadsComplexity
   file '*dup.qc' into dupReads
+  file('version_*.txt') into filterReadsVersions

   script:

-  if (pairedEnd) {
-    """
-    python3 $baseDir/scripts/map_qc.py -b $mapped -p
-    """
-  }
-  else {
-    """
-    python3 $baseDir/scripts/map_qc.py -b $mapped
-    """
-  }
+  if (params.astrocyte == true) {
+    if (pairedEnd) {
+      """
+      module load python/3.6.1-2-anaconda
+      module load samtools/1.4.1
+      module load sambamba/0.6.6
+      module load bedtools/2.26.0
+      python3 ${baseDir}/scripts/map_qc.py -b ${mapped} -p
+      """
+    }
+    else {
+      """
+      module load python/3.6.1-2-anaconda
+      module load samtools/1.4.1
+      module load sambamba/0.6.6
+      module load bedtools/2.26.0
+      python3 ${baseDir}/scripts/map_qc.py -b ${mapped}
+      """
+    }
+  }
+  else {
+    if (pairedEnd) {
+      """
+      python3 ${baseDir}/scripts/map_qc.py -b ${mapped} -p
+      """
+    }
+    else {
+      """
+      python3 ${baseDir}/scripts/map_qc.py -b ${mapped}
+      """
+    }
+  }
 }
...
...
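Note on the call pattern above: the rewritten filterReads process always invokes the QC script as python3 map_qc.py -b <bam>, adds -p only for paired-end libraries, and prefixes the command with module load lines only when params.astrocyte is true. A minimal, hypothetical sketch of a command-line interface matching those two flags (only -b and -p come from the workflow; the long option names, help text, and behaviour are assumptions, not the repository's actual map_qc.py):

# Hypothetical CLI sketch matching the flags passed by filterReads.
import argparse

def get_args():
    parser = argparse.ArgumentParser(description='Filter mapped reads and compute QC.')
    # '-b' carries the sorted BAM handed over from alignReads.
    parser.add_argument('-b', '--bam', required=True, help='Sorted BAM file to filter.')
    # '-p' is present for paired-end libraries and omitted for single-end runs.
    parser.add_argument('-p', '--paired', action='store_true', help='Library is paired-end.')
    return parser.parse_args()

if __name__ == '__main__':
    args = get_args()
    print('Filtering %s (paired-end: %s)' % (args.bam, args.paired))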
workflow/scripts/map_qc.py (+43 −8)  View file @ a14bd619
...
...
@@ -12,7 +12,6 @@ from multiprocessing import cpu_count
 import pandas as pd
 import utils

 EPILOG = '''
 For more details:
         %(prog)s --help
...
...
@@ -63,6 +62,15 @@ def check_tools():
     samtools_path = shutil.which("samtools")
     if samtools_path:
         logger.info('Found samtools: %s', samtools_path)
+
+        # Get Version
+        samtools_version_command = "samtools --version"
+        samtools_version = subprocess.check_output(samtools_version_command, shell=True)
+
+        # Write to file
+        samtools_file = open("version_samtools.txt", "wb")
+        samtools_file.write(samtools_version)
+        samtools_file.close()
     else:
         logger.error('Missing samtools')
         raise Exception('Missing samtools')
...
...
@@ -70,6 +78,18 @@ def check_tools():
     sambamba_path = shutil.which("sambamba")
     if sambamba_path:
         logger.info('Found sambamba: %s', sambamba_path)
+
+        # Get Version
+        sambamba_version_command = "sambamba"
+        try:
+            subprocess.check_output(sambamba_version_command, shell=True, stderr=subprocess.STDOUT)
+        except subprocess.CalledProcessError as e:
+            sambamba_version = e.output
+
+        # Write to file
+        sambamba_file = open("version_sambamba.txt", "wb")
+        sambamba_file.write(sambamba_version)
+        sambamba_file.close()
     else:
         logger.error('Missing sambamba')
         raise Exception('Missing sambamba')
...
...
@@ -77,6 +97,15 @@ def check_tools():
     bedtools_path = shutil.which("bedtools")
     if bedtools_path:
         logger.info('Found bedtools: %s', bedtools_path)
+
+        # Get Version
+        bedtools_version_command = "bedtools --version"
+        bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
+
+        # Write to file
+        bedtools_file = open("version_bedtools.txt", "wb")
+        bedtools_file.write(bedtools_version)
+        bedtools_file.close()
     else:
         logger.error('Missing bedtools')
         raise Exception('Missing bedtools')
...
...
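The three check_tools() additions above repeat the same version-capture pattern: confirm the tool is on PATH with shutil.which, grab its version banner with subprocess, and write it to version_<tool>.txt; for sambamba, which prints its banner to stderr and exits nonzero when run without arguments, the output is recovered from the CalledProcessError. A condensed sketch of that pattern as one reusable helper (an illustration only; the commit itself repeats the logic per tool):

# Sketch of the version-capture pattern used above, folded into a helper.
import shutil
import subprocess

def write_tool_version(tool, version_command):
    '''Record a tool's version banner in version_<tool>.txt.'''
    if not shutil.which(tool):
        raise Exception('Missing %s' % tool)
    try:
        version = subprocess.check_output(version_command, shell=True,
                                          stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        version = e.output  # e.g. sambamba exits nonzero but still prints its banner
    with open('version_%s.txt' % tool, 'wb') as handle:
        handle.write(version)

# write_tool_version('samtools', 'samtools --version')
# write_tool_version('sambamba', 'sambamba')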
@@ -168,9 +197,8 @@ def dedup_mapped(bam, bam_basename, paired):
         shlex.split(sambamba_markdup_command),
         stderr=dup_qc)

     # Remove duplicates
-    final_bam_prefix = bam_basename + ".filt.nodup"
+    final_bam_prefix = bam_basename + ".dedup"
     final_bam_filename = final_bam_prefix + ".bam"

     if paired:  # paired-end data
...
...
@@ -207,7 +235,7 @@ def dedup_mapped(bam, bam_basename, paired):
 def compute_complexity(bam, paired, bam_basename):
     '''Calculate library complexity .'''

-    pbc_file_qc_filename = bam_basename + ".filt.nodup.pbc.qc"
+    pbc_file_qc_filename = bam_basename + ".pbc.qc"
     tmp_pbc_file_qc_filename = "tmp.%s" % (pbc_file_qc_filename)

     # Sort by name
...
...
@@ -230,17 +258,20 @@ def compute_complexity(bam, paired, bam_basename):
         'TwoReadPairs',
         'NRF',
         'PBC1',
-        'PBC2']
+        'PBC2',
+        ]

     if paired:
         steps = [
             "samtools sort -@%d -n %s" % (cpu_count(), bam),
             "bamToBed -bedpe -i stdin",
-            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""]
+            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'""",
+            ]
     else:
         steps = [
             "bamToBed -i %s" % (bam),
-            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""]
+            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'""",
+            ]

     steps.extend([
         "grep -v 'chrM'",
         "sort",
...
...
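The pbc_headers list reformatted in the hunk above names the columns of the *.pbc.qc report that the bamToBed/awk/sort pipeline fills in. Assuming the usual ENCODE definitions behind those column names (the tail of the pipeline is collapsed in this view), the three summary ratios are simple quotients of the counted read-pair classes:

# Assumed ENCODE-style definitions for the NRF/PBC1/PBC2 columns of *.pbc.qc.
def complexity_ratios(total_pairs, distinct_pairs, one_read_pairs, two_read_pairs):
    nrf = distinct_pairs / total_pairs       # Non-Redundant Fraction
    pbc1 = one_read_pairs / distinct_pairs   # PCR Bottlenecking Coefficient 1
    pbc2 = one_read_pairs / two_read_pairs   # PCR Bottlenecking Coefficient 2
    return nrf, pbc1, pbc2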
@@ -251,9 +282,13 @@ def compute_complexity(bam, paired, bam_basename):
     if err:
         logger.error("PBC file error: %s", err)

-    # Add headers
+    # Add Sample Name and headers
     pbc_file = pd.read_csv(tmp_pbc_file_qc_filename, sep='\t', header=None,
                            names=pbc_headers)
+    pbc_file['Sample'] = bam_basename
+    pbc_headers_new = list(pbc_file)
+    pbc_headers_new.insert(0, pbc_headers_new.pop(pbc_headers_new.index('Sample')))
+    pbc_file = pbc_file[pbc_headers_new]
     pbc_file.to_csv(pbc_file_qc_filename, header=True, sep='\t', index=False)
     os.remove(bam)
     os.remove(bam + '.bai')
...
...
workflow/tests/test_map_qc.py (+42 −20)  View file @ a14bd619
...
...
@@ -8,23 +8,45 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
     '/../output/filterReads/'

-@pytest.mark.integration
-def test_map_qc_singleend():
-    #assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam'))
-    #assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam.bai'))
-    #filtered_reads_report = test_output_path + 'ENCFF646LXU.filt.nodup.flagstat.qc'
-    #samtools_report = open(filtered_reads_report).readlines()
-    #assert '64962570 + 0 in total' in samtools_report[0]
-    #assert '64962570 + 0 mapped (100.00%:N/A)' in samtools_report[4]
-    #library_complexity = test_output_path + 'ENCFF646LXU.filt.nodup.pbc.qc'
-    #df_library_complexity = pd.read_csv(library_complexity, sep='\t')
-    #assert df_library_complexity["NRF"].iloc[0] == 0.926192
-    #assert df_library_complexity["PBC1"].iloc[0] == 0.926775
-    #assert df_library_complexity["PBC2"].iloc[0] == 13.706885
-    pass
-
-
-@pytest.mark.integration
-def test_map_qc_pairedend():
-    # Do the same thing for paired end data
-    pass
+@pytest.mark.singleend_human
+def test_align_reads_singleend_human():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX/ENCLB969KTX.dedup.bam'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX/ENCLB969KTX.dedup.bam.bai'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX/ENCLB969KTX.dedup.qc'))
+
+
+@pytest.mark.singleend_human
+def test_align_reads_singleend_human():
+    filtered_reads_report = test_output_path + 'ENCLB969KTX/ENCLB969KTX.dedup.flagstat.qc'
+    samtools_report = open(filtered_reads_report).readlines()
+    assert '28118054 + 0 in total' in samtools_report[0]
+    assert '28118054 + 0 mapped (100.00%:N/A)' in samtools_report[4]
+
+
+@pytest.mark.singleend
+def test_library_complexity_singleend():
+    library_complexity = test_output_path + 'ENCLB969KTX/ENCLB969KTX.pbc.qc'
+    df_library_complexity = pd.read_csv(library_complexity, sep='\t')
+    assert df_library_complexity["NRF"].iloc[0] == 0.608847
+    assert df_library_complexity["PBC1"].iloc[0] == 0.641176
+    assert df_library_complexity["PBC2"].iloc[0] == 2.887092
+
+
+@pytest.mark.pairedend_mouse
+def test_align_reads_pairedend_mouse():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP/ENCLB122XDP.dedup.bam'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP/ENCLB122XDP.dedup.bam.bai'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP/ENCLB122XDP.dedup.qc'))
+
+
+@pytest.mark.pairedend_mouse
+def test_align_reads_pairedend_mouse():
+    filtered_reads_report = test_output_path + 'ENCLB122XDP/ENCLB122XDP.dedup.flagstat.qc'
+    samtools_report = open(filtered_reads_report).readlines()
+    assert '34342124 + 0 in total' in samtools_report[0]
+    assert '34342124 + 0 mapped (100.00%:N/A)' in samtools_report[4]
+
+
+@pytest.mark.pairedend_mouse
+def test_align_reads_pairedend_mouse():
+    library_complexity = test_output_path + 'ENCLB122XDP/ENCLB122XDP.pbc.qc'
+    df_library_complexity = pd.read_csv(library_complexity, sep='\t')
+    assert df_library_complexity["NRF"].iloc[0] == 0.912926
+    assert round(df_library_complexity["PBC1"].iloc[0], 6) == 0.913898
+    assert round(df_library_complexity["PBC2"].iloc[0], 6) == 11.654689
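The rewritten tests above are grouped by pytest markers (singleend_human, singleend, pairedend_mouse) instead of the old blanket integration mark, so each group can be selected with pytest's -m filter. A small sketch of running one group programmatically (the marker name and file path are taken from this diff; the project's CI presumably passes the same filter on the command line):

# Sketch: run only the paired-end mouse assertions from this test file.
import sys
import pytest

if __name__ == '__main__':
    sys.exit(pytest.main(['-m', 'pairedend_mouse', 'workflow/tests/test_map_qc.py']))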