Commit 70f61aa5 authored by yy1533's avatar yy1533
Browse files

🐾 on 0.4.7 : dummy species contains biotype of gene

parent 348a3d94
......@@ -11,7 +11,7 @@ import random
import argparse
def gtf_attr_str(gene_id=None, gene_name=None,
def gtf_attr_str(gene_id=None, gene_name=None, gene_biotype=None,
transcript_id=None, exon_num=None):
'''
Simple writer to fill 'attribute' column of GTF
......@@ -26,6 +26,8 @@ def gtf_attr_str(gene_id=None, gene_name=None,
return(None)
out = "gene_id \"{gid}\"; gene_name \"{gname}\";".format(gid=gene_id,
gname=gene_name)
if gene_biotype:
out += " gene_biotype \"{biotype}\";".format(biotype=gene_biotype)
if transcript_id:
out += " transcript_id \"{txid}\";".format(txid=transcript_id)
if exon_num:
......@@ -62,7 +64,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
frame = '.'
out = []
# g0
g_attr = gtf_attr_str(gene_id='g0', gene_name='celseq2_gene-0')
g_attr = gtf_attr_str(gene_id='g0', gene_name='celseq2_gene-0', gene_biotype='protein_coding')
g = gtf_str(chrm='chr1', src=src,
feature='gene',
start=s_stream,
......@@ -73,7 +75,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g0', gene_name='celseq_gene-0',
tx_attr = gtf_attr_str(gene_id='g0', gene_name='celseq_gene-0', gene_biotype='protein_coding',
transcript_id='tx0')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -86,7 +88,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g0', gene_name='celseq_gene-0',
exon_attr = gtf_attr_str(gene_id='g0', gene_name='celseq_gene-0', gene_biotype='protein_coding',
transcript_id='tx0', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
......@@ -103,7 +105,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
s_stream, e_stream = e_stream - 2 * len_exon - 1 * len_intron + 1, e_stream
# g8
g_attr = gtf_attr_str(gene_id='g8', gene_name='celseq2_gene-8')
g_attr = gtf_attr_str(gene_id='g8', gene_name='celseq2_gene-8', gene_biotype='lincRNA')
g = gtf_str(chrm='chr2', src=src,
feature='gene',
start=s_stream,
......@@ -115,7 +117,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g8', gene_name='celseq_gene-8',
transcript_id='tx8')
transcript_id='tx8', gene_biotype='lincRNA')
tx = gtf_str(chrm='chr2', src=src,
feature='transcript',
start=s_stream,
......@@ -127,7 +129,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g8', gene_name='celseq_gene-8',
exon_attr = gtf_attr_str(gene_id='g8', gene_name='celseq_gene-8', gene_biotype='lincRNA',
transcript_id='tx8', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr2', src=src,
......@@ -144,7 +146,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
s_stream = e_stream + 1 + len_intergenic
# g1
g_attr = gtf_attr_str(gene_id='g1', gene_name='celseq2_gene-1')
g_attr = gtf_attr_str(gene_id='g1', gene_name='celseq2_gene-1', gene_biotype='miRNA')
g = gtf_str(chrm='chr1', src=src,
feature='gene',
start=s_stream,
......@@ -155,7 +157,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g1', gene_name='celseq_gene-1',
tx_attr = gtf_attr_str(gene_id='g1', gene_name='celseq_gene-1', gene_biotype='miRNA',
transcript_id='tx1')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -168,7 +170,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(3, 0, -1):
exon_attr = gtf_attr_str(gene_id='g1', gene_name='celseq_gene-1',
exon_attr = gtf_attr_str(gene_id='g1', gene_name='celseq_gene-1', gene_biotype='miRNA',
transcript_id='tx1', exon_num=i)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
......@@ -185,7 +187,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
s_stream = e_stream + 1 + len_intergenic
# g2
g_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2')
g_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2', gene_biotype='protein_coding')
g = gtf_str(chrm='chr1', src=src,
feature='gene',
start=s_stream,
......@@ -196,7 +198,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2',
tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding',
transcript_id='tx2.1')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -209,7 +211,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2, 0, -1):
exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2',
exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding',
transcript_id='tx2', exon_num=i)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
......@@ -224,7 +226,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
out.append(exon)
s_stream = e_stream + 1 + len_intron
tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2',
tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding',
transcript_id='tx2.2')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -236,7 +238,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=tx_attr)
fout.write('{}\n'.format(tx))
out.append(tx)
exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2',
exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding',
transcript_id='tx2.2', exon_num=1)
exon = gtf_str(chrm='chr1', src=src,
feature='exon',
......@@ -251,7 +253,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
s_stream = e_stream + 1 + len_intergenic
# g3
g_attr = gtf_attr_str(gene_id='g3', gene_name='celseq2_gene-3')
g_attr = gtf_attr_str(gene_id='g3', gene_name='celseq2_gene-3', gene_biotype='protein_coding')
g = gtf_str(chrm='chr1', src=src,
feature='gene',
start=s_stream,
......@@ -262,7 +264,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g3', gene_name='celseq_gene-3',
tx_attr = gtf_attr_str(gene_id='g3', gene_name='celseq_gene-3', gene_biotype='protein_coding',
transcript_id='tx3')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -275,7 +277,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g3', gene_name='celseq_gene-3',
exon_attr = gtf_attr_str(gene_id='g3', gene_name='celseq_gene-3', gene_biotype='protein_coding',
transcript_id='tx3', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
......@@ -293,7 +295,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
s_stream = e_stream + 1 + len_intergenic
# g4
g_attr = gtf_attr_str(gene_id='g4', gene_name='celseq2_gene-4')
g_attr = gtf_attr_str(gene_id='g4', gene_name='celseq2_gene-4', gene_biotype='protein_coding')
g = gtf_str(chrm='chr1', src=src,
feature='gene',
start=s_stream,
......@@ -304,7 +306,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g4', gene_name='celseq_gene-4',
tx_attr = gtf_attr_str(gene_id='g4', gene_name='celseq_gene-4', gene_biotype='protein_coding',
transcript_id='tx4')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -317,7 +319,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g4', gene_name='celseq_gene-4',
exon_attr = gtf_attr_str(gene_id='g4', gene_name='celseq_gene-4', gene_biotype='protein_coding',
transcript_id='tx4', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
......@@ -334,7 +336,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
# g5
s_stream, e_stream = e_stream - len_exon + 1, e_stream
g_attr = gtf_attr_str(gene_id='g5', gene_name='celseq2_gene-5')
g_attr = gtf_attr_str(gene_id='g5', gene_name='celseq2_gene-5', gene_biotype='protein_coding')
g = gtf_str(chrm='chr1', src=src,
feature='gene',
start=s_stream,
......@@ -345,7 +347,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g5', gene_name='celseq_gene-5',
tx_attr = gtf_attr_str(gene_id='g5', gene_name='celseq_gene-5', gene_biotype='protein_coding',
transcript_id='tx5')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -358,7 +360,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(1, 0, -1):
exon_attr = gtf_attr_str(gene_id='g5', gene_name='celseq_gene-5',
exon_attr = gtf_attr_str(gene_id='g5', gene_name='celseq_gene-5', gene_biotype='protein_coding',
transcript_id='tx5', exon_num=i)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
......@@ -375,7 +377,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
s_stream = e_stream + 1 + len_intergenic
# g6
g_attr = gtf_attr_str(gene_id='g6', gene_name='celseq2_gene-6')
g_attr = gtf_attr_str(gene_id='g6', gene_name='celseq2_gene-6', gene_biotype='protein_coding')
g = gtf_str(chrm='chr1', src=src,
feature='gene',
start=s_stream,
......@@ -386,7 +388,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g6', gene_name='celseq_gene-6',
tx_attr = gtf_attr_str(gene_id='g6', gene_name='celseq_gene-6', gene_biotype='protein_coding',
transcript_id='tx6')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -398,7 +400,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=tx_attr)
fout.write('{}\n'.format(tx))
out.append(tx)
exon_attr = gtf_attr_str(gene_id='g6', gene_name='celseq_gene-6',
exon_attr = gtf_attr_str(gene_id='g6', gene_name='celseq_gene-6', gene_biotype='protein_coding',
transcript_id='tx6', exon_num=1)
exon = gtf_str(chrm='chr1', src=src,
feature='exon',
......@@ -411,7 +413,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(exon))
out.append(exon)
# g7
g_attr = gtf_attr_str(gene_id='g7', gene_name='celseq2_gene-7')
g_attr = gtf_attr_str(gene_id='g7', gene_name='celseq2_gene-7', gene_biotype='protein_coding')
g = gtf_str(chrm='chr1', src=src,
feature='gene',
start=s_stream,
......@@ -422,7 +424,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g7', gene_name='celseq_gene-7',
tx_attr = gtf_attr_str(gene_id='g7', gene_name='celseq_gene-7', gene_biotype='protein_coding',
transcript_id='tx7')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -435,7 +437,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g7', gene_name='celseq_gene-7',
exon_attr = gtf_attr_str(gene_id='g7', gene_name='celseq_gene-7', gene_biotype='protein_coding',
transcript_id='tx7', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
......
......@@ -8,30 +8,54 @@ from celseq2.helper import print_logger
def cook_anno_model(gff_fpath, feature_atrr='gene_id', feature_type='exon',
gene_types = (),
stranded=True, dumpto=None, verbose=False):
'''
Prepare a feature model.
Output: (features, exported_genes) where:
- features: HTSeq.GenomicArrayOfSets()
- exported_genes: a sorted list
For example, feature_atrr = 'gene_name', feature_type = 'exon',
gene_types = ('protein_coding', 'lincRNA'):
- features: all exons ~ all gnames mapping and ready for counting
- exported_genes: only protein_coding and lincRNA gnames are visible
Quantification used the full genes but only the selected genes are reported.
'''
features = HTSeq.GenomicArrayOfSets("auto", stranded=stranded)
fh_gff = HTSeq.GFF_Reader(gff_fpath)
all_genes = set()
exported_genes = set()
i = 0
for gff in fh_gff:
if verbose and i % 100000 == 0:
print_logger('Processing {:,} lines of GFF...'.format(i))
i += 1
if gff.type == feature_type:
features[gff.iv] += gff.attr[feature_atrr].strip()
all_genes.add(gff.attr[feature_atrr].strip())
else:
pass
if gff.type != feature_type:
continue
features[gff.iv] += gff.attr[feature_atrr].strip()
if not feature_atrr.startswith('gene'):
exported_genes.add(gff.attr[feature_atrr].strip())
continue
if not gene_types:
exported_genes.add(gff.attr[feature_atrr].strip())
continue
if gff.attr.get('gene_type', None) in gene_types:
exported_genes.add(gff.attr[feature_atrr].strip())
# if i == 5000:
# break ## test mode
print_logger('Processed {:,} lines of GFF...'.format(i))
if exported_genes:
exported_genes = sorted(exported_genes)
if dumpto:
with open(dumpto, 'wb') as fh:
pickle.dump((features, all_genes), fh)
return((features, all_genes))
pickle.dump((features, exported_genes), fh)
return((features, exported_genes))
def main():
......@@ -41,12 +65,17 @@ def main():
help='File path to GFF')
parser.add_argument('--feature-atrr', type=str,
default='gene_id',
help=('Reserved word for feature_atrr in GFF to be '
'called a \'gene\'.'))
help=('Reserved word for feature_atrr in GTF/GFF to be'
' considered a Gene, e.g., \"gene_id\".'))
parser.add_argument('--feature-type', type=str,
default='exon',
help=('Reserved word for feature type in GFF to be '
'the components of \'gene\'.'))
help=('Reserved word for feature type in 3rd column'
' in GTF/GFF to be the components of \'Gene\','
' e.g., \"exon\".'))
parser.add_argument('--gene-types', type=tuple,
default=(),
help=('Reserved words for the attrbute \"gene_biotype\"'
' (Gene type), e.g., protein_coding, lincRNA'))
parser.add_argument('--strandless', dest='stranded', action='store_false')
parser.set_defaults(stranded=True)
parser.add_argument('--dumpto', type=str, metavar='FILENAME',
......@@ -59,6 +88,7 @@ def main():
_ = cook_anno_model(gff_fpath=args.gff_file,
feature_atrr=args.feature_atrr,
feature_type=args.feature_type,
gene_types=args.gene_types,
stranded=args.stranded,
dumpto=args.dumpto,
verbose=args.verbose)
......
......@@ -21,6 +21,8 @@ GFF: '/absolute/path/to/wonderful.gtf.gz'
# Refer: http://htseq.readthedocs.io/en/master/count.html
FEATURE_ID: 'gene_name'
FEATURE_CONTENT: 'exon'
# GENE_BIOTYPE: ['protein_coding', 'lincRNA']
GENE_BIOTYPE:
## Demultiplexing ##
FASTQ_QUAL_MIN_OF_BC: 10
......
__version__ = '0.4.6'
__version__ = '0.4.7'
......@@ -47,6 +47,8 @@ STAR = config.get('STAR', None)
GFF = config.get('GFF', None)
FEATURE_ID = config.get('FEATURE_ID', 'gene_name')
FEATURE_CONTENT = config.get('FEATURE_CONTENT', 'exon')
GENE_BIOTYPE = config.get('GENE_BIOTYPE', None)
if not GENE_BIOTYPE: GENE_BIOTYPE = ();
## Demultiplexing ##
FASTQ_QUAL_MIN_OF_BC = config.get('FASTQ_QUAL_MIN_OF_BC', None) # 10
......@@ -222,7 +224,7 @@ rule setup_dir:
mkfolder(d)
## HT-seq Count UMI ##
rule cook_annotation:
rule COOK_ANNOTATION:
input:
'_done_setupdir',
output:
......@@ -233,12 +235,57 @@ rule cook_annotation:
run:
_ = cook_anno_model(GFF, feature_atrr=FEATURE_ID,
feature_type=FEATURE_CONTENT,
gene_types=GENE_BIOTYPE,
stranded=True,
dumpto=output.anno,
verbose=verbose)
shell('touch {output.flag}')
# Combo-demultiplexing
rule COMBO_DEMULTIPLEXING:
input:
flag2 = '_done_annotation',
flag1 = '_done_setupdir',
output:
'_done_combodemultiplex', # dynamic() is not allowed if task being called on terminal
message: 'Performing combo-demultiplexing'
threads: len(item_names)
run:
# Demultiplx fastq in Process pool
p = Pool(threads)
for itemid, itembc, itemr1, itemr2 in zip(item_names, bc_used, R1, R2):
itemid_in = join_path(DIR_PROJ, SUBDIR_INPUT, itemid)
try:
os.symlink(itemr1, join_path(itemid_in, 'R1.fastq.gz'))
os.symlink(itemr2, join_path(itemid_in, 'R2.fastq.gz'))
except OSError:
pass
itemid_fqs_dir = join_path(DIR_PROJ, SUBDIR_FASTQ, itemid)
itemid_log = join_path(DIR_PROJ, SUBDIR_DIAG, itemid,
'demultiplexing.log')
print_logger('Demultiplexing {}'.format(itemid))
cmd = " ".join(["bc_demultiplex",
itemr1,
itemr2,
"--bc-index {}".format(BC_INDEX_FPATH),
"--bc-seq-column {}".format(BC_SEQ_COLUMN),
"--bc-index-used {}".format(itembc),
"--min-bc-quality {}".format(FASTQ_QUAL_MIN_OF_BC),
"--umi-start-position {}".format(
UMI_START_POSITION),
"--bc-start-position {}".format(BC_START_POSITION),
"--umi-length {}".format(UMI_LENGTH),
"--bc-length {}".format(BC_LENGTH),
"--cut-length {}".format(CUT_LENGTH),
"--out-dir {}".format(itemid_fqs_dir),
"--is-gzip ",
"--stats-file {}".format(itemid_log)])
p.apply_async(shell, args=(cmd,))
p.close()
p.join()
shell('touch {output} ')
rule combo_demultiplexing:
input:
flag2 = '_done_annotation',
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment