Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Menu
Open sidebar
Holly Ruess
celseq2
Commits
70f61aa5
Commit
70f61aa5
authored
Mar 23, 2018
by
yy1533
Browse files
🐾
on 0.4.7 : dummy species contains biotype of gene
parent
348a3d94
Changes
5
Hide whitespace changes
Inline
Side-by-side
celseq2/dummy_species.py
View file @
70f61aa5
...
...
@@ -11,7 +11,7 @@ import random
import
argparse
def
gtf_attr_str
(
gene_id
=
None
,
gene_name
=
None
,
def
gtf_attr_str
(
gene_id
=
None
,
gene_name
=
None
,
gene_biotype
=
None
,
transcript_id
=
None
,
exon_num
=
None
):
'''
Simple writer to fill 'attribute' column of GTF
...
...
@@ -26,6 +26,8 @@ def gtf_attr_str(gene_id=None, gene_name=None,
return
(
None
)
out
=
"gene_id
\"
{gid}
\"
; gene_name
\"
{gname}
\"
;"
.
format
(
gid
=
gene_id
,
gname
=
gene_name
)
if
gene_biotype
:
out
+=
" gene_biotype
\"
{biotype}
\"
;"
.
format
(
biotype
=
gene_biotype
)
if
transcript_id
:
out
+=
" transcript_id
\"
{txid}
\"
;"
.
format
(
txid
=
transcript_id
)
if
exon_num
:
...
...
@@ -62,7 +64,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
frame
=
'.'
out
=
[]
# g0
g_attr
=
gtf_attr_str
(
gene_id
=
'g0'
,
gene_name
=
'celseq2_gene-0'
)
g_attr
=
gtf_attr_str
(
gene_id
=
'g0'
,
gene_name
=
'celseq2_gene-0'
,
gene_biotype
=
'protein_coding'
)
g
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'gene'
,
start
=
s_stream
,
...
...
@@ -73,7 +75,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr
=
g_attr
)
fout
.
write
(
'{}
\n
'
.
format
(
g
))
out
.
append
(
g
)
tx_attr
=
gtf_attr_str
(
gene_id
=
'g0'
,
gene_name
=
'celseq_gene-0'
,
tx_attr
=
gtf_attr_str
(
gene_id
=
'g0'
,
gene_name
=
'celseq_gene-0'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx0'
)
tx
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'transcript'
,
...
...
@@ -86,7 +88,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout
.
write
(
'{}
\n
'
.
format
(
tx
))
out
.
append
(
tx
)
for
i
in
range
(
2
):
exon_attr
=
gtf_attr_str
(
gene_id
=
'g0'
,
gene_name
=
'celseq_gene-0'
,
exon_attr
=
gtf_attr_str
(
gene_id
=
'g0'
,
gene_name
=
'celseq_gene-0'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx0'
,
exon_num
=
i
+
1
)
s_stream
,
e_stream
=
s_stream
,
s_stream
+
len_exon
-
1
exon
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
...
...
@@ -103,7 +105,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
s_stream
,
e_stream
=
e_stream
-
2
*
len_exon
-
1
*
len_intron
+
1
,
e_stream
# g8
g_attr
=
gtf_attr_str
(
gene_id
=
'g8'
,
gene_name
=
'celseq2_gene-8'
)
g_attr
=
gtf_attr_str
(
gene_id
=
'g8'
,
gene_name
=
'celseq2_gene-8'
,
gene_biotype
=
'lincRNA'
)
g
=
gtf_str
(
chrm
=
'chr2'
,
src
=
src
,
feature
=
'gene'
,
start
=
s_stream
,
...
...
@@ -115,7 +117,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout
.
write
(
'{}
\n
'
.
format
(
g
))
out
.
append
(
g
)
tx_attr
=
gtf_attr_str
(
gene_id
=
'g8'
,
gene_name
=
'celseq_gene-8'
,
transcript_id
=
'tx8'
)
transcript_id
=
'tx8'
,
gene_biotype
=
'lincRNA'
)
tx
=
gtf_str
(
chrm
=
'chr2'
,
src
=
src
,
feature
=
'transcript'
,
start
=
s_stream
,
...
...
@@ -127,7 +129,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout
.
write
(
'{}
\n
'
.
format
(
tx
))
out
.
append
(
tx
)
for
i
in
range
(
2
):
exon_attr
=
gtf_attr_str
(
gene_id
=
'g8'
,
gene_name
=
'celseq_gene-8'
,
exon_attr
=
gtf_attr_str
(
gene_id
=
'g8'
,
gene_name
=
'celseq_gene-8'
,
gene_biotype
=
'lincRNA'
,
transcript_id
=
'tx8'
,
exon_num
=
i
+
1
)
s_stream
,
e_stream
=
s_stream
,
s_stream
+
len_exon
-
1
exon
=
gtf_str
(
chrm
=
'chr2'
,
src
=
src
,
...
...
@@ -144,7 +146,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
s_stream
=
e_stream
+
1
+
len_intergenic
# g1
g_attr
=
gtf_attr_str
(
gene_id
=
'g1'
,
gene_name
=
'celseq2_gene-1'
)
g_attr
=
gtf_attr_str
(
gene_id
=
'g1'
,
gene_name
=
'celseq2_gene-1'
,
gene_biotype
=
'miRNA'
)
g
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'gene'
,
start
=
s_stream
,
...
...
@@ -155,7 +157,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr
=
g_attr
)
fout
.
write
(
'{}
\n
'
.
format
(
g
))
out
.
append
(
g
)
tx_attr
=
gtf_attr_str
(
gene_id
=
'g1'
,
gene_name
=
'celseq_gene-1'
,
tx_attr
=
gtf_attr_str
(
gene_id
=
'g1'
,
gene_name
=
'celseq_gene-1'
,
gene_biotype
=
'miRNA'
,
transcript_id
=
'tx1'
)
tx
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'transcript'
,
...
...
@@ -168,7 +170,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout
.
write
(
'{}
\n
'
.
format
(
tx
))
out
.
append
(
tx
)
for
i
in
range
(
3
,
0
,
-
1
):
exon_attr
=
gtf_attr_str
(
gene_id
=
'g1'
,
gene_name
=
'celseq_gene-1'
,
exon_attr
=
gtf_attr_str
(
gene_id
=
'g1'
,
gene_name
=
'celseq_gene-1'
,
gene_biotype
=
'miRNA'
,
transcript_id
=
'tx1'
,
exon_num
=
i
)
s_stream
,
e_stream
=
s_stream
,
s_stream
+
len_exon
-
1
exon
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
...
...
@@ -185,7 +187,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
s_stream
=
e_stream
+
1
+
len_intergenic
# g2
g_attr
=
gtf_attr_str
(
gene_id
=
'g2'
,
gene_name
=
'celseq2_gene-2'
)
g_attr
=
gtf_attr_str
(
gene_id
=
'g2'
,
gene_name
=
'celseq2_gene-2'
,
gene_biotype
=
'protein_coding'
)
g
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'gene'
,
start
=
s_stream
,
...
...
@@ -196,7 +198,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr
=
g_attr
)
fout
.
write
(
'{}
\n
'
.
format
(
g
))
out
.
append
(
g
)
tx_attr
=
gtf_attr_str
(
gene_id
=
'g2'
,
gene_name
=
'celseq_gene-2'
,
tx_attr
=
gtf_attr_str
(
gene_id
=
'g2'
,
gene_name
=
'celseq_gene-2'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx2.1'
)
tx
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'transcript'
,
...
...
@@ -209,7 +211,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout
.
write
(
'{}
\n
'
.
format
(
tx
))
out
.
append
(
tx
)
for
i
in
range
(
2
,
0
,
-
1
):
exon_attr
=
gtf_attr_str
(
gene_id
=
'g2'
,
gene_name
=
'celseq_gene-2'
,
exon_attr
=
gtf_attr_str
(
gene_id
=
'g2'
,
gene_name
=
'celseq_gene-2'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx2'
,
exon_num
=
i
)
s_stream
,
e_stream
=
s_stream
,
s_stream
+
len_exon
-
1
exon
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
...
...
@@ -224,7 +226,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
out
.
append
(
exon
)
s_stream
=
e_stream
+
1
+
len_intron
tx_attr
=
gtf_attr_str
(
gene_id
=
'g2'
,
gene_name
=
'celseq_gene-2'
,
tx_attr
=
gtf_attr_str
(
gene_id
=
'g2'
,
gene_name
=
'celseq_gene-2'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx2.2'
)
tx
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'transcript'
,
...
...
@@ -236,7 +238,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr
=
tx_attr
)
fout
.
write
(
'{}
\n
'
.
format
(
tx
))
out
.
append
(
tx
)
exon_attr
=
gtf_attr_str
(
gene_id
=
'g2'
,
gene_name
=
'celseq_gene-2'
,
exon_attr
=
gtf_attr_str
(
gene_id
=
'g2'
,
gene_name
=
'celseq_gene-2'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx2.2'
,
exon_num
=
1
)
exon
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'exon'
,
...
...
@@ -251,7 +253,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
s_stream
=
e_stream
+
1
+
len_intergenic
# g3
g_attr
=
gtf_attr_str
(
gene_id
=
'g3'
,
gene_name
=
'celseq2_gene-3'
)
g_attr
=
gtf_attr_str
(
gene_id
=
'g3'
,
gene_name
=
'celseq2_gene-3'
,
gene_biotype
=
'protein_coding'
)
g
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'gene'
,
start
=
s_stream
,
...
...
@@ -262,7 +264,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr
=
g_attr
)
fout
.
write
(
'{}
\n
'
.
format
(
g
))
out
.
append
(
g
)
tx_attr
=
gtf_attr_str
(
gene_id
=
'g3'
,
gene_name
=
'celseq_gene-3'
,
tx_attr
=
gtf_attr_str
(
gene_id
=
'g3'
,
gene_name
=
'celseq_gene-3'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx3'
)
tx
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'transcript'
,
...
...
@@ -275,7 +277,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout
.
write
(
'{}
\n
'
.
format
(
tx
))
out
.
append
(
tx
)
for
i
in
range
(
2
):
exon_attr
=
gtf_attr_str
(
gene_id
=
'g3'
,
gene_name
=
'celseq_gene-3'
,
exon_attr
=
gtf_attr_str
(
gene_id
=
'g3'
,
gene_name
=
'celseq_gene-3'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx3'
,
exon_num
=
i
+
1
)
s_stream
,
e_stream
=
s_stream
,
s_stream
+
len_exon
-
1
exon
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
...
...
@@ -293,7 +295,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
s_stream
=
e_stream
+
1
+
len_intergenic
# g4
g_attr
=
gtf_attr_str
(
gene_id
=
'g4'
,
gene_name
=
'celseq2_gene-4'
)
g_attr
=
gtf_attr_str
(
gene_id
=
'g4'
,
gene_name
=
'celseq2_gene-4'
,
gene_biotype
=
'protein_coding'
)
g
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'gene'
,
start
=
s_stream
,
...
...
@@ -304,7 +306,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr
=
g_attr
)
fout
.
write
(
'{}
\n
'
.
format
(
g
))
out
.
append
(
g
)
tx_attr
=
gtf_attr_str
(
gene_id
=
'g4'
,
gene_name
=
'celseq_gene-4'
,
tx_attr
=
gtf_attr_str
(
gene_id
=
'g4'
,
gene_name
=
'celseq_gene-4'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx4'
)
tx
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'transcript'
,
...
...
@@ -317,7 +319,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout
.
write
(
'{}
\n
'
.
format
(
tx
))
out
.
append
(
tx
)
for
i
in
range
(
2
):
exon_attr
=
gtf_attr_str
(
gene_id
=
'g4'
,
gene_name
=
'celseq_gene-4'
,
exon_attr
=
gtf_attr_str
(
gene_id
=
'g4'
,
gene_name
=
'celseq_gene-4'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx4'
,
exon_num
=
i
+
1
)
s_stream
,
e_stream
=
s_stream
,
s_stream
+
len_exon
-
1
exon
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
...
...
@@ -334,7 +336,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
# g5
s_stream
,
e_stream
=
e_stream
-
len_exon
+
1
,
e_stream
g_attr
=
gtf_attr_str
(
gene_id
=
'g5'
,
gene_name
=
'celseq2_gene-5'
)
g_attr
=
gtf_attr_str
(
gene_id
=
'g5'
,
gene_name
=
'celseq2_gene-5'
,
gene_biotype
=
'protein_coding'
)
g
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'gene'
,
start
=
s_stream
,
...
...
@@ -345,7 +347,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr
=
g_attr
)
fout
.
write
(
'{}
\n
'
.
format
(
g
))
out
.
append
(
g
)
tx_attr
=
gtf_attr_str
(
gene_id
=
'g5'
,
gene_name
=
'celseq_gene-5'
,
tx_attr
=
gtf_attr_str
(
gene_id
=
'g5'
,
gene_name
=
'celseq_gene-5'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx5'
)
tx
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'transcript'
,
...
...
@@ -358,7 +360,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout
.
write
(
'{}
\n
'
.
format
(
tx
))
out
.
append
(
tx
)
for
i
in
range
(
1
,
0
,
-
1
):
exon_attr
=
gtf_attr_str
(
gene_id
=
'g5'
,
gene_name
=
'celseq_gene-5'
,
exon_attr
=
gtf_attr_str
(
gene_id
=
'g5'
,
gene_name
=
'celseq_gene-5'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx5'
,
exon_num
=
i
)
s_stream
,
e_stream
=
s_stream
,
s_stream
+
len_exon
-
1
exon
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
...
...
@@ -375,7 +377,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
s_stream
=
e_stream
+
1
+
len_intergenic
# g6
g_attr
=
gtf_attr_str
(
gene_id
=
'g6'
,
gene_name
=
'celseq2_gene-6'
)
g_attr
=
gtf_attr_str
(
gene_id
=
'g6'
,
gene_name
=
'celseq2_gene-6'
,
gene_biotype
=
'protein_coding'
)
g
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'gene'
,
start
=
s_stream
,
...
...
@@ -386,7 +388,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr
=
g_attr
)
fout
.
write
(
'{}
\n
'
.
format
(
g
))
out
.
append
(
g
)
tx_attr
=
gtf_attr_str
(
gene_id
=
'g6'
,
gene_name
=
'celseq_gene-6'
,
tx_attr
=
gtf_attr_str
(
gene_id
=
'g6'
,
gene_name
=
'celseq_gene-6'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx6'
)
tx
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'transcript'
,
...
...
@@ -398,7 +400,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr
=
tx_attr
)
fout
.
write
(
'{}
\n
'
.
format
(
tx
))
out
.
append
(
tx
)
exon_attr
=
gtf_attr_str
(
gene_id
=
'g6'
,
gene_name
=
'celseq_gene-6'
,
exon_attr
=
gtf_attr_str
(
gene_id
=
'g6'
,
gene_name
=
'celseq_gene-6'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx6'
,
exon_num
=
1
)
exon
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'exon'
,
...
...
@@ -411,7 +413,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout
.
write
(
'{}
\n
'
.
format
(
exon
))
out
.
append
(
exon
)
# g7
g_attr
=
gtf_attr_str
(
gene_id
=
'g7'
,
gene_name
=
'celseq2_gene-7'
)
g_attr
=
gtf_attr_str
(
gene_id
=
'g7'
,
gene_name
=
'celseq2_gene-7'
,
gene_biotype
=
'protein_coding'
)
g
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'gene'
,
start
=
s_stream
,
...
...
@@ -422,7 +424,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr
=
g_attr
)
fout
.
write
(
'{}
\n
'
.
format
(
g
))
out
.
append
(
g
)
tx_attr
=
gtf_attr_str
(
gene_id
=
'g7'
,
gene_name
=
'celseq_gene-7'
,
tx_attr
=
gtf_attr_str
(
gene_id
=
'g7'
,
gene_name
=
'celseq_gene-7'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx7'
)
tx
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
feature
=
'transcript'
,
...
...
@@ -435,7 +437,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout
.
write
(
'{}
\n
'
.
format
(
tx
))
out
.
append
(
tx
)
for
i
in
range
(
2
):
exon_attr
=
gtf_attr_str
(
gene_id
=
'g7'
,
gene_name
=
'celseq_gene-7'
,
exon_attr
=
gtf_attr_str
(
gene_id
=
'g7'
,
gene_name
=
'celseq_gene-7'
,
gene_biotype
=
'protein_coding'
,
transcript_id
=
'tx7'
,
exon_num
=
i
+
1
)
s_stream
,
e_stream
=
s_stream
,
s_stream
+
len_exon
-
1
exon
=
gtf_str
(
chrm
=
'chr1'
,
src
=
src
,
...
...
celseq2/prepare_annotation_model.py
View file @
70f61aa5
...
...
@@ -8,30 +8,54 @@ from celseq2.helper import print_logger
def
cook_anno_model
(
gff_fpath
,
feature_atrr
=
'gene_id'
,
feature_type
=
'exon'
,
gene_types
=
(),
stranded
=
True
,
dumpto
=
None
,
verbose
=
False
):
'''
Prepare a feature model.
Output: (features, exported_genes) where:
- features: HTSeq.GenomicArrayOfSets()
- exported_genes: a sorted list
For example, feature_atrr = 'gene_name', feature_type = 'exon',
gene_types = ('protein_coding', 'lincRNA'):
- features: all exons ~ all gnames mapping and ready for counting
- exported_genes: only protein_coding and lincRNA gnames are visible
Quantification used the full genes but only the selected genes are reported.
'''
features
=
HTSeq
.
GenomicArrayOfSets
(
"auto"
,
stranded
=
stranded
)
fh_gff
=
HTSeq
.
GFF_Reader
(
gff_fpath
)
all
_genes
=
set
()
exported
_genes
=
set
()
i
=
0
for
gff
in
fh_gff
:
if
verbose
and
i
%
100000
==
0
:
print_logger
(
'Processing {:,} lines of GFF...'
.
format
(
i
))
i
+=
1
if
gff
.
type
==
feature_type
:
features
[
gff
.
iv
]
+=
gff
.
attr
[
feature_atrr
].
strip
()
all_genes
.
add
(
gff
.
attr
[
feature_atrr
].
strip
())
else
:
pass
if
gff
.
type
!=
feature_type
:
continue
features
[
gff
.
iv
]
+=
gff
.
attr
[
feature_atrr
].
strip
()
if
not
feature_atrr
.
startswith
(
'gene'
):
exported_genes
.
add
(
gff
.
attr
[
feature_atrr
].
strip
())
continue
if
not
gene_types
:
exported_genes
.
add
(
gff
.
attr
[
feature_atrr
].
strip
())
continue
if
gff
.
attr
.
get
(
'gene_type'
,
None
)
in
gene_types
:
exported_genes
.
add
(
gff
.
attr
[
feature_atrr
].
strip
())
# if i == 5000:
# break ## test mode
print_logger
(
'Processed {:,} lines of GFF...'
.
format
(
i
))
if
exported_genes
:
exported_genes
=
sorted
(
exported_genes
)
if
dumpto
:
with
open
(
dumpto
,
'wb'
)
as
fh
:
pickle
.
dump
((
features
,
all
_genes
),
fh
)
return
((
features
,
all
_genes
))
pickle
.
dump
((
features
,
exported
_genes
),
fh
)
return
((
features
,
exported
_genes
))
def
main
():
...
...
@@ -41,12 +65,17 @@ def main():
help
=
'File path to GFF'
)
parser
.
add_argument
(
'--feature-atrr'
,
type
=
str
,
default
=
'gene_id'
,
help
=
(
'Reserved word for feature_atrr in GFF to be
'
'
called a
\'
gene
\'
.'
))
help
=
(
'Reserved word for feature_atrr in
GTF/
GFF to be'
'
considered a Gene, e.g.,
\"
gene_id
\"
.'
))
parser
.
add_argument
(
'--feature-type'
,
type
=
str
,
default
=
'exon'
,
help
=
(
'Reserved word for feature type in GFF to be '
'the components of
\'
gene
\'
.'
))
help
=
(
'Reserved word for feature type in 3rd column'
' in GTF/GFF to be the components of
\'
Gene
\'
,'
' e.g.,
\"
exon
\"
.'
))
parser
.
add_argument
(
'--gene-types'
,
type
=
tuple
,
default
=
(),
help
=
(
'Reserved words for the attrbute
\"
gene_biotype
\"
'
' (Gene type), e.g., protein_coding, lincRNA'
))
parser
.
add_argument
(
'--strandless'
,
dest
=
'stranded'
,
action
=
'store_false'
)
parser
.
set_defaults
(
stranded
=
True
)
parser
.
add_argument
(
'--dumpto'
,
type
=
str
,
metavar
=
'FILENAME'
,
...
...
@@ -59,6 +88,7 @@ def main():
_
=
cook_anno_model
(
gff_fpath
=
args
.
gff_file
,
feature_atrr
=
args
.
feature_atrr
,
feature_type
=
args
.
feature_type
,
gene_types
=
args
.
gene_types
,
stranded
=
args
.
stranded
,
dumpto
=
args
.
dumpto
,
verbose
=
args
.
verbose
)
...
...
celseq2/template/config.yaml
View file @
70f61aa5
...
...
@@ -21,6 +21,8 @@ GFF: '/absolute/path/to/wonderful.gtf.gz'
# Refer: http://htseq.readthedocs.io/en/master/count.html
FEATURE_ID
:
'
gene_name'
FEATURE_CONTENT
:
'
exon'
# GENE_BIOTYPE: ['protein_coding', 'lincRNA']
GENE_BIOTYPE
:
## Demultiplexing ##
FASTQ_QUAL_MIN_OF_BC
:
10
...
...
celseq2/version.py
View file @
70f61aa5
__version__
=
'0.4.
6
'
__version__
=
'0.4.
7
'
celseq2/workflow/celseq2.snakemake
View file @
70f61aa5
...
...
@@ -47,6 +47,8 @@ STAR = config.get('STAR', None)
GFF = config.get('GFF', None)
FEATURE_ID = config.get('FEATURE_ID', 'gene_name')
FEATURE_CONTENT = config.get('FEATURE_CONTENT', 'exon')
GENE_BIOTYPE = config.get('GENE_BIOTYPE', None)
if not GENE_BIOTYPE: GENE_BIOTYPE = ();
## Demultiplexing ##
FASTQ_QUAL_MIN_OF_BC = config.get('FASTQ_QUAL_MIN_OF_BC', None) # 10
...
...
@@ -222,7 +224,7 @@ rule setup_dir:
mkfolder(d)
## HT-seq Count UMI ##
rule
cook_annotation
:
rule
COOK_ANNOTATION
:
input:
'_done_setupdir',
output:
...
...
@@ -233,12 +235,57 @@ rule cook_annotation:
run:
_ = cook_anno_model(GFF, feature_atrr=FEATURE_ID,
feature_type=FEATURE_CONTENT,
gene_types=GENE_BIOTYPE,
stranded=True,
dumpto=output.anno,
verbose=verbose)
shell('touch {output.flag}')
# Combo-demultiplexing
rule COMBO_DEMULTIPLEXING:
input:
flag2 = '_done_annotation',
flag1 = '_done_setupdir',
output:
'_done_combodemultiplex', # dynamic() is not allowed if task being called on terminal
message: 'Performing combo-demultiplexing'
threads: len(item_names)
run:
# Demultiplx fastq in Process pool
p = Pool(threads)
for itemid, itembc, itemr1, itemr2 in zip(item_names, bc_used, R1, R2):
itemid_in = join_path(DIR_PROJ, SUBDIR_INPUT, itemid)
try:
os.symlink(itemr1, join_path(itemid_in, 'R1.fastq.gz'))
os.symlink(itemr2, join_path(itemid_in, 'R2.fastq.gz'))
except OSError:
pass
itemid_fqs_dir = join_path(DIR_PROJ, SUBDIR_FASTQ, itemid)
itemid_log = join_path(DIR_PROJ, SUBDIR_DIAG, itemid,
'demultiplexing.log')
print_logger('Demultiplexing {}'.format(itemid))
cmd = " ".join(["bc_demultiplex",
itemr1,
itemr2,
"--bc-index {}".format(BC_INDEX_FPATH),
"--bc-seq-column {}".format(BC_SEQ_COLUMN),
"--bc-index-used {}".format(itembc),
"--min-bc-quality {}".format(FASTQ_QUAL_MIN_OF_BC),
"--umi-start-position {}".format(
UMI_START_POSITION),
"--bc-start-position {}".format(BC_START_POSITION),
"--umi-length {}".format(UMI_LENGTH),
"--bc-length {}".format(BC_LENGTH),
"--cut-length {}".format(CUT_LENGTH),
"--out-dir {}".format(itemid_fqs_dir),
"--is-gzip ",
"--stats-file {}".format(itemid_log)])
p.apply_async(shell, args=(cmd,))
p.close()
p.join()
shell('touch {output} ')
rule combo_demultiplexing:
input:
flag2 = '_done_annotation',
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment