Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Menu
Open sidebar
Holly Ruess
celseq2
Commits
51c02b7d
Unverified
Commit
51c02b7d
authored
Apr 02, 2018
by
Yun YAN
Committed by
GitHub
Apr 02, 2018
Browse files
Merge pull request #16 from Puriney/master
0.4.8
parents
ec5d06b2
9e4ced0c
Changes
8
Hide whitespace changes
Inline
Side-by-side
celseq2/demultiplex.py
View file @
51c02b7d
...
...
@@ -77,6 +77,7 @@ def demultiplexing(read1_fpath, read2_fpath, dict_bc_id2seq,
len_umi
=
6
,
len_bc
=
6
,
len_tx
=
35
,
bc_qual_min
=
10
,
is_gzip
=
True
,
save_unknown_bc_fastq
=
False
,
do_bc_rev_complement
=
False
,
do_tx_rev_complement
=
False
,
verbose
=
False
):
...
...
@@ -153,12 +154,13 @@ def demultiplexing(read1_fpath, read2_fpath, dict_bc_id2seq,
try
:
fhout
=
bc_fhout
[
cell_bc
]
except
KeyError
:
fhout
=
bc_fhout
[
'UNKNOWNBC_R1'
]
fhout
.
write
(
'{}
\n
{}
\n
{}
\n
{}
\n
'
.
format
(
umibc_name
,
umibc_seq
,
"+"
,
umibc_qualstr
))
fhout
=
bc_fhout
[
'UNKNOWNBC_R2'
]
fhout
.
write
(
'{}
\n
{}
\n
{}
\n
{}
\n
'
.
format
(
tx_name
,
tx_seq
,
"+"
,
tx_qualstr
))
if
save_unknown_bc_fastq
:
fhout
=
bc_fhout
[
'UNKNOWNBC_R1'
]
fhout
.
write
(
'{}
\n
{}
\n
{}
\n
{}
\n
'
.
format
(
umibc_name
,
umibc_seq
,
"+"
,
umibc_qualstr
))
fhout
=
bc_fhout
[
'UNKNOWNBC_R2'
]
fhout
.
write
(
'{}
\n
{}
\n
{}
\n
{}
\n
'
.
format
(
tx_name
,
tx_seq
,
"+"
,
tx_qualstr
))
sample_counter
[
'unknown'
]
+=
1
continue
...
...
@@ -184,20 +186,20 @@ def demultiplexing(read1_fpath, read2_fpath, dict_bc_id2seq,
def
write_demultiplexing
(
stats
,
dict_bc_id2seq
,
stats_fpath
):
if
stats_fpath
is
None
:
stats_fpath
=
'demultiplexing.
log
'
stats_fpath
=
'demultiplexing.
csv
'
try
:
fh_stats
=
open
(
stats_fpath
,
'w'
)
except
Exception
as
e
:
raise
Exception
(
e
)
fh_stats
.
write
(
'BC
\t
Reads(#)
\t
Reads(%)
\n
'
)
fh_stats
.
write
(
'BC
,
Reads(#)
,
Reads(%)
\n
'
)
for
bc_id
,
bc_seq
in
dict_bc_id2seq
.
items
():
# bc_id = '[{:04d}]'.format('-'.join(map(str, bc_id)))
formatter
=
'{:04d}-{}
\t
{}
\t
{:07.3f}
\n
'
formatter
=
'{:04d}-{}
,{},
{:07.3f}
\n
'
fh_stats
.
write
(
formatter
.
format
(
bc_id
,
bc_seq
,
stats
[
bc_seq
],
stats
[
bc_seq
]
/
stats
[
'total'
]
*
100
))
formatter
=
'{}
\t
{}
\t
{:07.3f}
\n
'
formatter
=
'{}
,{},
{:07.3f}
\n
'
fh_stats
.
write
(
formatter
.
format
(
'saved'
,
stats
[
'saved'
],
stats
[
'saved'
]
/
stats
[
'total'
]
*
100
))
fh_stats
.
write
(
formatter
.
format
(
'unknown'
,
stats
[
'unknown'
],
...
...
@@ -247,6 +249,9 @@ def main():
help
=
'Length of CELSeq barcode (default=6)'
)
parser
.
add_argument
(
'--cut-length'
,
metavar
=
'N'
,
type
=
int
,
default
=
35
,
help
=
'Length of read on R2 to be mapped. (default=35)'
)
parser
.
add_argument
(
'--save-unknown-bc-fastq'
,
dest
=
'save_unknown_bc_fastq'
,
action
=
'store_true'
)
parser
.
set_defaults
(
save_unknown_bc_fastq
=
False
)
parser
.
add_argument
(
'--verbose'
,
dest
=
'verbose'
,
action
=
'store_true'
)
parser
.
set_defaults
(
verbose
=
False
)
...
...
@@ -269,6 +274,7 @@ def main():
len_tx
=
args
.
cut_length
,
bc_qual_min
=
args
.
min_bc_quality
,
is_gzip
=
args
.
is_gzip
,
save_unknown_bc_fastq
=
args
.
save_unknown_bc_fastq
,
do_bc_rev_complement
=
False
,
do_tx_rev_complement
=
False
,
verbose
=
args
.
verbose
)
...
...
celseq2/dummy_species.py
View file @
51c02b7d
...
...
@@ -75,7 +75,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr
=
g_attr
)
fout
.
write
(
'{}
\n
'
.
format
(
g
))
out
.
append
(
g
)
tx_attr
=
gtf_attr_str
(
gene_id
=
'g0'
,
gene_name
=
'celseq_gene-0'
,
gene_biotype
=
'protein_coding'
,
tx_attr
=
gtf_attr_str
(
gene_id
=
'g0'
,
gene_name
=
'celseq
2
_gene-0'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx0'
)
tx
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'transcript'
,
...
...
@@ -88,7 +88,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout
.
write
(
'{}
\n
'
.
format
(
tx
))
out
.
append
(
tx
)
for
i
in
range
(
2
):
exon_attr
=
gtf_attr_str
(
gene_id
=
'g0'
,
gene_name
=
'celseq_gene-0'
,
gene_biotype
=
'protein_coding'
,
exon_attr
=
gtf_attr_str
(
gene_id
=
'g0'
,
gene_name
=
'celseq
2
_gene-0'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx0'
,
exon_num
=
i
+
1
)
s_stream
,
e_stream
=
s_stream
,
s_stream
+
len_exon
-
1
exon
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
...
...
@@ -116,7 +116,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr
=
g_attr
)
fout
.
write
(
'{}
\n
'
.
format
(
g
))
out
.
append
(
g
)
tx_attr
=
gtf_attr_str
(
gene_id
=
'g8'
,
gene_name
=
'celseq_gene-8'
,
tx_attr
=
gtf_attr_str
(
gene_id
=
'g8'
,
gene_name
=
'celseq
2
_gene-8'
,
transcript_id
=
'tx8'
,
gene_biotype
=
'lincRNA'
)
tx
=
gtf_str
(
chrm
=
'chr2'
,
src
=
src
,
feature
=
'transcript'
,
...
...
@@ -129,7 +129,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout
.
write
(
'{}
\n
'
.
format
(
tx
))
out
.
append
(
tx
)
for
i
in
range
(
2
):
exon_attr
=
gtf_attr_str
(
gene_id
=
'g8'
,
gene_name
=
'celseq_gene-8'
,
gene_biotype
=
'lincRNA'
,
exon_attr
=
gtf_attr_str
(
gene_id
=
'g8'
,
gene_name
=
'celseq
2
_gene-8'
,
gene_biotype
=
'lincRNA'
,
transcript_id
=
'tx8'
,
exon_num
=
i
+
1
)
s_stream
,
e_stream
=
s_stream
,
s_stream
+
len_exon
-
1
exon
=
gtf_str
(
chrm
=
'chr2'
,
src
=
src
,
...
...
@@ -157,7 +157,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr
=
g_attr
)
fout
.
write
(
'{}
\n
'
.
format
(
g
))
out
.
append
(
g
)
tx_attr
=
gtf_attr_str
(
gene_id
=
'g1'
,
gene_name
=
'celseq_gene-1'
,
gene_biotype
=
'miRNA'
,
tx_attr
=
gtf_attr_str
(
gene_id
=
'g1'
,
gene_name
=
'celseq
2
_gene-1'
,
gene_biotype
=
'miRNA'
,
transcript_id
=
'tx1'
)
tx
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'transcript'
,
...
...
@@ -170,7 +170,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout
.
write
(
'{}
\n
'
.
format
(
tx
))
out
.
append
(
tx
)
for
i
in
range
(
3
,
0
,
-
1
):
exon_attr
=
gtf_attr_str
(
gene_id
=
'g1'
,
gene_name
=
'celseq_gene-1'
,
gene_biotype
=
'miRNA'
,
exon_attr
=
gtf_attr_str
(
gene_id
=
'g1'
,
gene_name
=
'celseq
2
_gene-1'
,
gene_biotype
=
'miRNA'
,
transcript_id
=
'tx1'
,
exon_num
=
i
)
s_stream
,
e_stream
=
s_stream
,
s_stream
+
len_exon
-
1
exon
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
...
...
@@ -198,7 +198,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr
=
g_attr
)
fout
.
write
(
'{}
\n
'
.
format
(
g
))
out
.
append
(
g
)
tx_attr
=
gtf_attr_str
(
gene_id
=
'g2'
,
gene_name
=
'celseq_gene-2'
,
gene_biotype
=
'protein_coding'
,
tx_attr
=
gtf_attr_str
(
gene_id
=
'g2'
,
gene_name
=
'celseq
2
_gene-2'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx2.1'
)
tx
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'transcript'
,
...
...
@@ -211,7 +211,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout
.
write
(
'{}
\n
'
.
format
(
tx
))
out
.
append
(
tx
)
for
i
in
range
(
2
,
0
,
-
1
):
exon_attr
=
gtf_attr_str
(
gene_id
=
'g2'
,
gene_name
=
'celseq_gene-2'
,
gene_biotype
=
'protein_coding'
,
exon_attr
=
gtf_attr_str
(
gene_id
=
'g2'
,
gene_name
=
'celseq
2
_gene-2'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx2'
,
exon_num
=
i
)
s_stream
,
e_stream
=
s_stream
,
s_stream
+
len_exon
-
1
exon
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
...
...
@@ -226,7 +226,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
out
.
append
(
exon
)
s_stream
=
e_stream
+
1
+
len_intron
tx_attr
=
gtf_attr_str
(
gene_id
=
'g2'
,
gene_name
=
'celseq_gene-2'
,
gene_biotype
=
'protein_coding'
,
tx_attr
=
gtf_attr_str
(
gene_id
=
'g2'
,
gene_name
=
'celseq
2
_gene-2'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx2.2'
)
tx
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'transcript'
,
...
...
@@ -238,7 +238,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr
=
tx_attr
)
fout
.
write
(
'{}
\n
'
.
format
(
tx
))
out
.
append
(
tx
)
exon_attr
=
gtf_attr_str
(
gene_id
=
'g2'
,
gene_name
=
'celseq_gene-2'
,
gene_biotype
=
'protein_coding'
,
exon_attr
=
gtf_attr_str
(
gene_id
=
'g2'
,
gene_name
=
'celseq
2
_gene-2'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx2.2'
,
exon_num
=
1
)
exon
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'exon'
,
...
...
@@ -264,7 +264,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr
=
g_attr
)
fout
.
write
(
'{}
\n
'
.
format
(
g
))
out
.
append
(
g
)
tx_attr
=
gtf_attr_str
(
gene_id
=
'g3'
,
gene_name
=
'celseq_gene-3'
,
gene_biotype
=
'protein_coding'
,
tx_attr
=
gtf_attr_str
(
gene_id
=
'g3'
,
gene_name
=
'celseq
2
_gene-3'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx3'
)
tx
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'transcript'
,
...
...
@@ -277,7 +277,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout
.
write
(
'{}
\n
'
.
format
(
tx
))
out
.
append
(
tx
)
for
i
in
range
(
2
):
exon_attr
=
gtf_attr_str
(
gene_id
=
'g3'
,
gene_name
=
'celseq_gene-3'
,
gene_biotype
=
'protein_coding'
,
exon_attr
=
gtf_attr_str
(
gene_id
=
'g3'
,
gene_name
=
'celseq
2
_gene-3'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx3'
,
exon_num
=
i
+
1
)
s_stream
,
e_stream
=
s_stream
,
s_stream
+
len_exon
-
1
exon
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
...
...
@@ -306,7 +306,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr
=
g_attr
)
fout
.
write
(
'{}
\n
'
.
format
(
g
))
out
.
append
(
g
)
tx_attr
=
gtf_attr_str
(
gene_id
=
'g4'
,
gene_name
=
'celseq_gene-4'
,
gene_biotype
=
'protein_coding'
,
tx_attr
=
gtf_attr_str
(
gene_id
=
'g4'
,
gene_name
=
'celseq
2
_gene-4'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx4'
)
tx
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'transcript'
,
...
...
@@ -319,7 +319,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout
.
write
(
'{}
\n
'
.
format
(
tx
))
out
.
append
(
tx
)
for
i
in
range
(
2
):
exon_attr
=
gtf_attr_str
(
gene_id
=
'g4'
,
gene_name
=
'celseq_gene-4'
,
gene_biotype
=
'protein_coding'
,
exon_attr
=
gtf_attr_str
(
gene_id
=
'g4'
,
gene_name
=
'celseq
2
_gene-4'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx4'
,
exon_num
=
i
+
1
)
s_stream
,
e_stream
=
s_stream
,
s_stream
+
len_exon
-
1
exon
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
...
...
@@ -347,7 +347,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr
=
g_attr
)
fout
.
write
(
'{}
\n
'
.
format
(
g
))
out
.
append
(
g
)
tx_attr
=
gtf_attr_str
(
gene_id
=
'g5'
,
gene_name
=
'celseq_gene-5'
,
gene_biotype
=
'protein_coding'
,
tx_attr
=
gtf_attr_str
(
gene_id
=
'g5'
,
gene_name
=
'celseq
2
_gene-5'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx5'
)
tx
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'transcript'
,
...
...
@@ -360,7 +360,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout
.
write
(
'{}
\n
'
.
format
(
tx
))
out
.
append
(
tx
)
for
i
in
range
(
1
,
0
,
-
1
):
exon_attr
=
gtf_attr_str
(
gene_id
=
'g5'
,
gene_name
=
'celseq_gene-5'
,
gene_biotype
=
'protein_coding'
,
exon_attr
=
gtf_attr_str
(
gene_id
=
'g5'
,
gene_name
=
'celseq
2
_gene-5'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx5'
,
exon_num
=
i
)
s_stream
,
e_stream
=
s_stream
,
s_stream
+
len_exon
-
1
exon
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
...
...
@@ -388,7 +388,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr
=
g_attr
)
fout
.
write
(
'{}
\n
'
.
format
(
g
))
out
.
append
(
g
)
tx_attr
=
gtf_attr_str
(
gene_id
=
'g6'
,
gene_name
=
'celseq_gene-6'
,
gene_biotype
=
'protein_coding'
,
tx_attr
=
gtf_attr_str
(
gene_id
=
'g6'
,
gene_name
=
'celseq
2
_gene-6'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx6'
)
tx
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'transcript'
,
...
...
@@ -400,7 +400,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr
=
tx_attr
)
fout
.
write
(
'{}
\n
'
.
format
(
tx
))
out
.
append
(
tx
)
exon_attr
=
gtf_attr_str
(
gene_id
=
'g6'
,
gene_name
=
'celseq_gene-6'
,
gene_biotype
=
'protein_coding'
,
exon_attr
=
gtf_attr_str
(
gene_id
=
'g6'
,
gene_name
=
'celseq
2
_gene-6'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx6'
,
exon_num
=
1
)
exon
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'exon'
,
...
...
@@ -424,7 +424,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr
=
g_attr
)
fout
.
write
(
'{}
\n
'
.
format
(
g
))
out
.
append
(
g
)
tx_attr
=
gtf_attr_str
(
gene_id
=
'g7'
,
gene_name
=
'celseq_gene-7'
,
gene_biotype
=
'protein_coding'
,
tx_attr
=
gtf_attr_str
(
gene_id
=
'g7'
,
gene_name
=
'celseq
2
_gene-7'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx7'
)
tx
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'transcript'
,
...
...
@@ -437,7 +437,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout
.
write
(
'{}
\n
'
.
format
(
tx
))
out
.
append
(
tx
)
for
i
in
range
(
2
):
exon_attr
=
gtf_attr_str
(
gene_id
=
'g7'
,
gene_name
=
'celseq_gene-7'
,
gene_biotype
=
'protein_coding'
,
exon_attr
=
gtf_attr_str
(
gene_id
=
'g7'
,
gene_name
=
'celseq
2
_gene-7'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx7'
,
exon_num
=
i
+
1
)
s_stream
,
e_stream
=
s_stream
,
s_stream
+
len_exon
-
1
exon
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
...
...
celseq2/helper.py
View file @
51c02b7d
...
...
@@ -35,7 +35,7 @@ def ymdhms():
def
print_logger
(
msg
):
localtime
=
time
.
asctime
(
time
.
localtime
(
time
.
time
()))
# sys.stderr.write("[ {} ] {}\n".format(localtime, msg))
print
(
"[ {} ] {}
\n
"
.
format
(
localtime
,
msg
))
print
(
"[ {} ] {}
\n
"
.
format
(
localtime
,
msg
)
,
flush
=
True
)
def
mkfolder
(
dirpath
):
...
...
celseq2/prepare_annotation_model.py
View file @
51c02b7d
...
...
@@ -5,6 +5,7 @@ import argparse
import
pickle
from
celseq2.helper
import
print_logger
# from genometools.ensembl.annotations import get_genes
def
cook_anno_model
(
gff_fpath
,
feature_atrr
=
'gene_id'
,
feature_type
=
'exon'
,
...
...
@@ -50,8 +51,13 @@ def cook_anno_model(gff_fpath, feature_atrr='gene_id', feature_type='exon',
print_logger
(
'Processed {:,} lines of GFF...'
.
format
(
i
))
# Use genometools to select exported_genes
# if gene_types:
# exported_genes = get_genes(gff_fpath, valid_biotypes=set(gene_types))
# exported_genes = list(exported_genes['name'].values)
if
exported_genes
:
exported_genes
=
sorted
(
exported_genes
)
exported_genes
=
tuple
(
sorted
(
exported_genes
)
)
if
dumpto
:
with
open
(
dumpto
,
'wb'
)
as
fh
:
pickle
.
dump
((
features
,
exported_genes
),
fh
)
...
...
celseq2/template/config.yaml
View file @
51c02b7d
## CEL-seq2 Tech Setting ##
####################################
## Tech Setting
####################################
BC_INDEX_FPATH
:
'
/absolute/path/to/wonderful_umi.tab'
BC_SEQ_COLUMN
:
1
BC_SEQ_COLUMN
:
0
BC_IDs_DEFAULT
:
'
1-96'
UMI_START_POSITION
:
0
UMI_LENGTH
:
6
BC_START_POSITION
:
6
BC_LENGTH
:
6
## Tools ##
# 'bowtie2' 'star'
####################################
## Alignment Tools
####################################
## Which RNA-seq aligner to use? 'bowtie2' or 'star'
ALIGNER
:
'
bowtie2'
BOWTIE2_INDEX_PREFIX
:
'
/
absolute
/
path
/
to
/bowtie2_index'
## What is the
absolute
path
to
command bowtie2?
BOWTIE2
:
'
/absolute/path/to/bowtie2'
STAR_INDEX_DIR
:
'
/absolute/path/to/star/folder'
## What is the sharef prefix of bowtie2 index?
BOWTIE2_INDEX_PREFIX
:
'
/absolute/path/to/bowtie2_index'
## What is the absolute path to command STAR?
STAR
:
'
/absolute/path/to/star'
## Whare is the directory to save STAR index?
STAR_INDEX_DIR
:
'
/absolute/path/to/star/folder/'
####################################
## Annotations ##
# Path to GTF/GFF file
####################################
## Where is the GTF.gz/GFF.gz/GTF/GFF file?
GFF
:
'
/absolute/path/to/wonderful.gtf.gz'
# Refer: http://htseq.readthedocs.io/en/master/count.html
## Refer: http://htseq.readthedocs.io/en/master/count.html
## What is considered the "gene"? 'gene_name' or 'gene_id'?
FEATURE_ID
:
'
gene_name'
## What is the content of the "gene"? Mostly 'exon'.
FEATURE_CONTENT
:
'
exon'
#
GENE_BIOTYPE: ['protein_coding', 'lincRNA']
#
#
Which type of genes to report?
GENE_BIOTYPE
:
-
'
protein_coding'
-
'
lincRNA'
## If nothing set as below, all genes are reported.
## GENE_BIOTYPE:
## Demultiplexing ##
####################################
## Demultiplexing
####################################
FASTQ_QUAL_MIN_OF_BC
:
10
CUT_LENGTH
:
35
SAVE_UNKNOWN_BC_FASTQ
:
false
##
Alignment
##
##
UMI Count
##
##
################################
##
## UMI Count
##
################################
##
ALN_QUAL_MIN
:
0
## Running Parameters ##
####################################
## Running Parameters
####################################
num_threads
:
15
verbose
:
True
celseq2/version.py
View file @
51c02b7d
__version__
=
'0.4.
7
'
__version__
=
'0.4.
8
'
celseq2/workflow/celseq2.snakemake
View file @
51c02b7d
...
...
@@ -18,6 +18,7 @@ import tempfile
import shutil
import os
## Inforamtion ##
# '/ifs/home/yy1533/Lab/cel-seq-pipe/demo/celseq2'
DIR_PROJ = config.get('output_dir', None)
...
...
@@ -53,6 +54,7 @@ if not GENE_BIOTYPE: GENE_BIOTYPE = ();
## Demultiplexing ##
FASTQ_QUAL_MIN_OF_BC = config.get('FASTQ_QUAL_MIN_OF_BC', None) # 10
CUT_LENGTH = config.get('CUT_LENGTH', None) # 35
SAVE_UNKNOWN_BC_FASTQ = config.get('SAVE_UNKNOWN_BC_FASTQ', False) # False
## Alignment ##
ALIGNER = config.get('ALIGNER', None) # 'bowtie2', 'star'
assert (ALIGNER), 'Error: Specify aligner.'
...
...
@@ -65,8 +67,8 @@ STRANDED = config.get('stranded', 'yes')
## Running Parameters ##
# bc_ids_used=config.get('bc_ids_used', None) # '1,2,3,4-5'
num_threads = config.get('num_threads',
None
) # 5
verbose = config.get('verbose',
Non
e) # True
num_threads = config.get('num_threads',
16
) # 5
verbose = config.get('verbose',
Tru
e) # True
#######################
...
...
@@ -107,10 +109,10 @@ SUBDIRS = [SUBDIR_INPUT,
SUBDIR_LOG, SUBDIR_QSUB, SUBDIR_DIAG, SUBDIR_ANNO
]
aln_diagnose_item = ["_unmapped",
"_low_map_qual", '_multimapped', "_uniquemapped",
"_no_feature", "_ambiguous",
"_total"]
#
aln_diagnose_item = ["_unmapped",
#
"_low_map_qual", '_multimapped', "_uniquemapped",
#
"_no_feature", "_ambiguous",
#
"_total"]
## Dev ##
#####################
...
...
@@ -121,39 +123,16 @@ workdir: DIR_PROJ
rule all:
input:
'_done_annotation',
'_done_UMI',
'_done_report',
'_done_annotation',
'_done_umimatrix_per_experiment'
,
output:
touch('_DONE')
,
run:
if glob.glob('celseq2_job*.sh*'):
mkfolder(SUBDIR_QSUB)
shell('mv -f celseq2_job*.sh* {}'.format(SUBDIR_QSUB))
rule COUNT_MATRIX:
input:
csv = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.csv'),
expid=list(set(sample_list))),
hdf = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.h5'),
expid=list(set(sample_list))),
output:
touch('_done_UMI')
message: 'Finished counting UMI-count matrix.'
run:
if ALIGNER == 'star':
shell('rm {}'.format('_done_star_genome_loaded'))
print('Free memory loaded by STAR', flush=True)
with tempfile.TemporaryDirectory() as tmpdirname:
cmd = 'STAR '
cmd += '--genomeLoad Remove '
cmd += '--genomeDir {STAR_INDEX_DIR} '
cmd += '--outFileNamePrefix '
cmd += join_path(tmpdirname, '')
shell(cmd)
print_logger('UMI-count matrix is saved at {}'.format(input.csv))
'''
rule celseq2:
message: 'Finished Entire Pipeline.'
...
...
@@ -201,112 +180,87 @@ rule celseq2:
print_logger('Expression UMI matrix is saved at {}'.format(input.csv))
'''
rule setup_dir:
input: SAMPLE_TABLE_FPATH
output:
touch('_done_setupdir'),
dir1 = SUBDIRS,
dir2 = expand(join_path('{subdir}', '{itemid}'),
subdir=[SUBDIR_INPUT,
SUBDIR_FASTQ, SUBDIR_ALIGN, SUBDIR_DIAG,
SUBDIR_UMI_CNT, SUBDIR_UMI_SET, SUBDIR_LOG],
itemid=item_names),
dir3 = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', '{itemid}'),
zip, expid=sample_list, itemid=item_names),
message: 'Setting up project directory.'
run:
for d in output.dir1:
mkfolder(d)
for d in output.dir2:
mkfolder(d)
for d in output.dir3:
mkfolder(d)
#
rule setup_dir:
#
input: SAMPLE_TABLE_FPATH
#
output:
#
touch('_done_setupdir'),
#
dir1 = SUBDIRS,
#
dir2 = expand(join_path('{subdir}', '{itemid}'),
#
subdir=[SUBDIR_INPUT,
#
SUBDIR_FASTQ, SUBDIR_ALIGN, SUBDIR_DIAG,
#
SUBDIR_UMI_CNT, SUBDIR_UMI_SET, SUBDIR_LOG],
#
itemid=item_names),
#
dir3 = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', '{itemid}'),
#
zip, expid=sample_list, itemid=item_names),
#
message: 'Setting up project directory.'
#
run:
#
for d in output.dir1:
#
mkfolder(d)
#
for d in output.dir2:
#
mkfolder(d)
#
for d in output.dir3:
#
mkfolder(d)
## HT-seq Count UMI ##
rule CO
OK_ANNOTATION
:
rule CO
UNT_MATRIX
:
input:
'_done_setupdir',
csv = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.csv'),
expid=list(set(sample_list))),
hdf = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.h5'),
expid=list(set(sample_list))),
output:
anno = join_path(DIR_PROJ, SUBDIR_ANNO,
base_name(GFF) + '.pickle'),
flag = '_done_annotation',
message: 'Cooking Annotation'
touch('_done_UMI')
message: 'Finished counting UMI-count matrix.'
run:
_ = cook_anno_model(GFF, feature_atrr=FEATURE_ID,
feature_type=FEATURE_CONTENT,
gene_types=GENE_BIOTYPE,
stranded=True,
dumpto=output.anno,
verbose=verbose)
shell('touch {output.flag}')
if ALIGNER == 'star':
shell('rm {}'.format('_done_star_genome_loaded'))
print('Free memory loaded by STAR', flush=True)
with tempfile.TemporaryDirectory() as tmpdirname:
cmd = 'STAR '
cmd += '--genomeLoad Remove '
cmd += '--genomeDir {STAR_INDEX_DIR} '
cmd += '--outFileNamePrefix '
cmd += join_path(tmpdirname, '')
shell(cmd)
print_logger('UMI-count matrix is saved at {}'.format(input.csv))
# Combo-demultiplexing
rule COMBO_DEMULTIPLEXING:
rule REPORT_ALIGNMENT_LOG:
input:
flag2 = '_done_annotation',
flag1 = '_done_setupdir',
report = expand(join_path(DIR_PROJ, SUBDIR_REPORT, '{itemid}',
'alignment_'+ALIGNER+'.csv'),
itemid=item_names),
output:
'_done_combodemultiplex', # dynamic() is not allowed if task being called on terminal
message: 'Performing combo-demultiplexing'
threads: len(item_names)
run:
# Demultiplx fastq in Process pool
p = Pool(threads)
for itemid, itembc, itemr1, itemr2 in zip(item_names, bc_used, R1, R2):
itemid_in = join_path(DIR_PROJ, SUBDIR_INPUT, itemid)
try:
os.symlink(itemr1, join_path(itemid_in, 'R1.fastq.gz'))
os.symlink(itemr2, join_path(itemid_in, 'R2.fastq.gz'))
except OSError:
pass
itemid_fqs_dir = join_path(DIR_PROJ, SUBDIR_FASTQ, itemid)
itemid_log = join_path(DIR_PROJ, SUBDIR_DIAG, itemid,
'demultiplexing.log')
print_logger('Demultiplexing {}'.format(itemid))
cmd = " ".join(["bc_demultiplex",
itemr1,
itemr2,
"--bc-index {}".format(BC_INDEX_FPATH),
"--bc-seq-column {}".format(BC_SEQ_COLUMN),
"--bc-index-used {}".format(itembc),
"--min-bc-quality {}".format(FASTQ_QUAL_MIN_OF_BC),
"--umi-start-position {}".format(
UMI_START_POSITION),
"--bc-start-position {}".format(BC_START_POSITION),
"--umi-length {}".format(UMI_LENGTH),
"--bc-length {}".format(BC_LENGTH),
"--cut-length {}".format(CUT_LENGTH),
"--out-dir {}".format(itemid_fqs_dir),
"--is-gzip ",
"--stats-file {}".format(itemid_log)])
p.apply_async(shell, args=(cmd,))
p.close()
p.join()
shell('touch {output} ')
touch('_done_report')
# Combo-demultiplexing
rule combo_demultiplexing:
input:
flag2 = '_done_annotation',
flag1 = '_done_setupdir',
input:
SAMPLE_TABLE_FPATH,
#
flag2 = '_done_annotation',
#
flag1 = '_done_setupdir',
output:
fq = dynamic(join_path(DIR_PROJ, SUBDIR_FASTQ, '{itemid}', '{bc}.fastq')),
message: 'Performing combo-demultiplexing'
threads: len(item_names)
params:
jobs = len(item_names),
save_unknown_bc_fastq = SAVE_UNKNOWN_BC_FASTQ,
run:
# Demultiplx fastq in Process pool
p = Pool(
thread
s)
p = Pool(
params.job
s)
for itemid, itembc, itemr1, itemr2 in zip(item_names, bc_used, R1, R2):
itemid_in = join_path(DIR_PROJ, SUBDIR_INPUT, itemid)
mkfolder(itemid_in)
try:
os.symlink(itemr1, join_path(itemid_in, 'R1.fastq.gz'))
os.symlink(itemr2, join_path(itemid_in, 'R2.fastq.gz'))
except OSError:
pass
itemid_fqs_dir = join_path(DIR_PROJ, SUBDIR_FASTQ, itemid)
itemid_log = join_path(DIR_PROJ, SUBDIR_DIAG, itemid,
'demultiplexing.log')
mkfolder(join_path(DIR_PROJ, SUBDIR_REPORT, itemid))
itemid_log = join_path(DIR_PROJ, SUBDIR_REPORT, itemid,
'demultiplexing.csv')
print_logger('Demultiplexing {}'.format(itemid))
cmd = " ".join(["bc_demultiplex",
itemr1,
...
...
@@ -324,6 +278,9 @@ rule combo_demultiplexing:
"--out-dir {}".format(itemid_fqs_dir),
"--is-gzip ",
"--stats-file {}".format(itemid_log)])
if params.save_unknown_bc_fastq:
cmd += ' --save-unknown-bc-fastq '
p.apply_async(shell, args=(cmd,))
p.close()
p.join()
...
...
@@ -427,11 +384,52 @@ if ALIGNER == 'star':
pickle.dump(df, open(output.report, 'wb'))