Commit 44114efa authored by yy1533's avatar yy1533
Browse files

🇨🇳 0.4.8 stable design of workflow; compatible to indrop pipeline

parent dde40ffa
......@@ -184,20 +184,20 @@ def demultiplexing(read1_fpath, read2_fpath, dict_bc_id2seq,
def write_demultiplexing(stats, dict_bc_id2seq, stats_fpath):
if stats_fpath is None:
stats_fpath = 'demultiplexing.log'
stats_fpath = 'demultiplexing.csv'
try:
fh_stats = open(stats_fpath, 'w')
except Exception as e:
raise Exception(e)
fh_stats.write('BC\tReads(#)\tReads(%)\n')
fh_stats.write('BC,Reads(#),Reads(%)\n')
for bc_id, bc_seq in dict_bc_id2seq.items():
# bc_id = '[{:04d}]'.format('-'.join(map(str, bc_id)))
formatter = '{:04d}-{}\t{}\t{:07.3f}\n'
formatter = '{:04d}-{},{},{:07.3f}\n'
fh_stats.write(formatter.format(bc_id, bc_seq, stats[bc_seq],
stats[bc_seq] / stats['total'] * 100))
formatter = '{}\t{}\t{:07.3f}\n'
formatter = '{},{},{:07.3f}\n'
fh_stats.write(formatter.format('saved', stats['saved'],
stats['saved'] / stats['total'] * 100))
fh_stats.write(formatter.format('unknown', stats['unknown'],
......
......@@ -75,7 +75,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g0', gene_name='celseq_gene-0', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g0', gene_name='celseq2_gene-0', gene_biotype='protein_coding',
transcript_id='tx0')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -88,7 +88,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g0', gene_name='celseq_gene-0', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g0', gene_name='celseq2_gene-0', gene_biotype='protein_coding',
transcript_id='tx0', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
......@@ -116,7 +116,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g8', gene_name='celseq_gene-8',
tx_attr = gtf_attr_str(gene_id='g8', gene_name='celseq2_gene-8',
transcript_id='tx8', gene_biotype='lincRNA')
tx = gtf_str(chrm='chr2', src=src,
feature='transcript',
......@@ -129,7 +129,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g8', gene_name='celseq_gene-8', gene_biotype='lincRNA',
exon_attr = gtf_attr_str(gene_id='g8', gene_name='celseq2_gene-8', gene_biotype='lincRNA',
transcript_id='tx8', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr2', src=src,
......@@ -157,7 +157,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g1', gene_name='celseq_gene-1', gene_biotype='miRNA',
tx_attr = gtf_attr_str(gene_id='g1', gene_name='celseq2_gene-1', gene_biotype='miRNA',
transcript_id='tx1')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -170,7 +170,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(3, 0, -1):
exon_attr = gtf_attr_str(gene_id='g1', gene_name='celseq_gene-1', gene_biotype='miRNA',
exon_attr = gtf_attr_str(gene_id='g1', gene_name='celseq2_gene-1', gene_biotype='miRNA',
transcript_id='tx1', exon_num=i)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
......@@ -198,7 +198,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2', gene_biotype='protein_coding',
transcript_id='tx2.1')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -211,7 +211,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2, 0, -1):
exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2', gene_biotype='protein_coding',
transcript_id='tx2', exon_num=i)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
......@@ -226,7 +226,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
out.append(exon)
s_stream = e_stream + 1 + len_intron
tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2', gene_biotype='protein_coding',
transcript_id='tx2.2')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -238,7 +238,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=tx_attr)
fout.write('{}\n'.format(tx))
out.append(tx)
exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2', gene_biotype='protein_coding',
transcript_id='tx2.2', exon_num=1)
exon = gtf_str(chrm='chr1', src=src,
feature='exon',
......@@ -264,7 +264,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g3', gene_name='celseq_gene-3', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g3', gene_name='celseq2_gene-3', gene_biotype='protein_coding',
transcript_id='tx3')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -277,7 +277,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g3', gene_name='celseq_gene-3', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g3', gene_name='celseq2_gene-3', gene_biotype='protein_coding',
transcript_id='tx3', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
......@@ -306,7 +306,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g4', gene_name='celseq_gene-4', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g4', gene_name='celseq2_gene-4', gene_biotype='protein_coding',
transcript_id='tx4')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -319,7 +319,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g4', gene_name='celseq_gene-4', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g4', gene_name='celseq2_gene-4', gene_biotype='protein_coding',
transcript_id='tx4', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
......@@ -347,7 +347,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g5', gene_name='celseq_gene-5', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g5', gene_name='celseq2_gene-5', gene_biotype='protein_coding',
transcript_id='tx5')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -360,7 +360,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(1, 0, -1):
exon_attr = gtf_attr_str(gene_id='g5', gene_name='celseq_gene-5', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g5', gene_name='celseq2_gene-5', gene_biotype='protein_coding',
transcript_id='tx5', exon_num=i)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
......@@ -388,7 +388,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g6', gene_name='celseq_gene-6', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g6', gene_name='celseq2_gene-6', gene_biotype='protein_coding',
transcript_id='tx6')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -400,7 +400,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=tx_attr)
fout.write('{}\n'.format(tx))
out.append(tx)
exon_attr = gtf_attr_str(gene_id='g6', gene_name='celseq_gene-6', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g6', gene_name='celseq2_gene-6', gene_biotype='protein_coding',
transcript_id='tx6', exon_num=1)
exon = gtf_str(chrm='chr1', src=src,
feature='exon',
......@@ -424,7 +424,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g7', gene_name='celseq_gene-7', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g7', gene_name='celseq2_gene-7', gene_biotype='protein_coding',
transcript_id='tx7')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -437,7 +437,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g7', gene_name='celseq_gene-7', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g7', gene_name='celseq2_gene-7', gene_biotype='protein_coding',
transcript_id='tx7', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
......
......@@ -35,7 +35,7 @@ def ymdhms():
def print_logger(msg):
localtime = time.asctime(time.localtime(time.time()))
# sys.stderr.write("[ {} ] {}\n".format(localtime, msg))
print("[ {} ] {}\n".format(localtime, msg))
print("[ {} ] {}\n".format(localtime, msg), flush=True)
def mkfolder(dirpath):
......
......@@ -5,7 +5,7 @@ import argparse
import pickle
from celseq2.helper import print_logger
from genometools.ensembl.annotations import get_genes
# from genometools.ensembl.annotations import get_genes
def cook_anno_model(gff_fpath, feature_atrr='gene_id', feature_type='exon',
......@@ -46,15 +46,16 @@ def cook_anno_model(gff_fpath, feature_atrr='gene_id', feature_type='exon',
exported_genes.add(gff.attr[feature_atrr].strip())
continue
# if gff.attr.get('gene_biotype', None) in gene_types:
# exported_genes.add(gff.attr[feature_atrr].strip())
if gff.attr.get('gene_biotype', None) in gene_types:
exported_genes.add(gff.attr[feature_atrr].strip())
print_logger('Processed {:,} lines of GFF...'.format(i))
# Use genometools to select exported_genes
if gene_types:
exported_genes = get_genes(gff_fpath, valid_biotypes=set(gene_types))
exported_genes = list(exported_genes['name'].values)
# if gene_types:
# exported_genes = get_genes(gff_fpath, valid_biotypes=set(gene_types))
# exported_genes = list(exported_genes['name'].values)
if exported_genes:
exported_genes = tuple(sorted(exported_genes))
if dumpto:
......
......@@ -18,6 +18,7 @@ import tempfile
import shutil
import os
## Inforamtion ##
# '/ifs/home/yy1533/Lab/cel-seq-pipe/demo/celseq2'
DIR_PROJ = config.get('output_dir', None)
......@@ -107,10 +108,10 @@ SUBDIRS = [SUBDIR_INPUT,
SUBDIR_LOG, SUBDIR_QSUB, SUBDIR_DIAG, SUBDIR_ANNO
]
aln_diagnose_item = ["_unmapped",
"_low_map_qual", '_multimapped', "_uniquemapped",
"_no_feature", "_ambiguous",
"_total"]
# aln_diagnose_item = ["_unmapped",
# "_low_map_qual", '_multimapped', "_uniquemapped",
# "_no_feature", "_ambiguous",
# "_total"]
## Dev ##
#####################
......@@ -121,39 +122,13 @@ workdir: DIR_PROJ
rule all:
input:
'_done_annotation',
'_done_UMI',
'_done_report',
'_done_annotation',
'_done_umimatrix_per_experiment',
run:
if glob.glob('celseq2_job*.sh*'):
shell('mv -f celseq2_job*.sh* {}'.format(SUBDIR_QSUB))
rule COUNT_MATRIX:
input:
csv = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.csv'),
expid=list(set(sample_list))),
hdf = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.h5'),
expid=list(set(sample_list))),
output:
touch('_done_UMI')
message: 'Finished counting UMI-count matrix.'
run:
if ALIGNER == 'star':
shell('rm {}'.format('_done_star_genome_loaded'))
print('Free memory loaded by STAR', flush=True)
with tempfile.TemporaryDirectory() as tmpdirname:
cmd = 'STAR '
cmd += '--genomeLoad Remove '
cmd += '--genomeDir {STAR_INDEX_DIR} '
cmd += '--outFileNamePrefix '
cmd += join_path(tmpdirname, '')
shell(cmd)
print_logger('UMI-count matrix is saved at {}'.format(input.csv))
'''
rule celseq2:
message: 'Finished Entire Pipeline.'
......@@ -201,95 +176,103 @@ rule celseq2:
print_logger('Expression UMI matrix is saved at {}'.format(input.csv))
'''
rule setup_dir:
input: SAMPLE_TABLE_FPATH
output:
touch('_done_setupdir'),
dir1 = SUBDIRS,
dir2 = expand(join_path('{subdir}', '{itemid}'),
subdir=[SUBDIR_INPUT,
SUBDIR_FASTQ, SUBDIR_ALIGN, SUBDIR_DIAG,
SUBDIR_UMI_CNT, SUBDIR_UMI_SET, SUBDIR_LOG],
itemid=item_names),
dir3 = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', '{itemid}'),
zip, expid=sample_list, itemid=item_names),
message: 'Setting up project directory.'
run:
for d in output.dir1:
mkfolder(d)
for d in output.dir2:
mkfolder(d)
for d in output.dir3:
mkfolder(d)
# rule setup_dir:
# input: SAMPLE_TABLE_FPATH
# output:
# touch('_done_setupdir'),
# dir1 = SUBDIRS,
# dir2 = expand(join_path('{subdir}', '{itemid}'),
# subdir=[SUBDIR_INPUT,
# SUBDIR_FASTQ, SUBDIR_ALIGN, SUBDIR_DIAG,
# SUBDIR_UMI_CNT, SUBDIR_UMI_SET, SUBDIR_LOG],
# itemid=item_names),
# dir3 = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', '{itemid}'),
# zip, expid=sample_list, itemid=item_names),
# message: 'Setting up project directory.'
# run:
# for d in output.dir1:
# mkfolder(d)
# for d in output.dir2:
# mkfolder(d)
# for d in output.dir3:
# mkfolder(d)
## HT-seq Count UMI ##
rule COOK_ANNOTATION:
input:
'_done_setupdir',
SAMPLE_TABLE_FPATH,
output:
anno = join_path(DIR_PROJ, SUBDIR_ANNO,
base_name(GFF) + '.pickle'),
anno_pkl = join_path(DIR_PROJ, SUBDIR_ANNO, base_name(GFF) + '.pickle'),
anno_csv = join_path(DIR_PROJ, SUBDIR_ANNO, base_name(GFF) + '.csv'),
flag = '_done_annotation',
params:
gene_type = GENE_BIOTYPE
priority: 100
message: 'Cooking Annotation'
run:
_ = cook_anno_model(GFF, feature_atrr=FEATURE_ID,
feature_type=FEATURE_CONTENT,
gene_types=GENE_BIOTYPE,
stranded=True,
dumpto=output.anno,
verbose=verbose)
from genometools.ensembl.annotations import get_genes
features, exported_genes = cook_anno_model(
GFF, feature_atrr=FEATURE_ID,
feature_type=FEATURE_CONTENT,
gene_types=[], # defautl to all genes
stranded=True,
dumpto=None, # manual export
verbose=verbose)
if params.gene_type:
print_logger('Types of reported gene: {}.'.format(
', '.join(params.gene_type)))
gene_df = get_genes(GFF, valid_biotypes = set(params.gene_type))
exported_genes = sorted(gene_df['name'].values)
print_logger('Number of reported genes: {}.'.format(
len(exported_genes)))
gene_df.to_csv(output.anno_csv)
else:
shell('touch {output.anno_csv}')
with open(output.anno_pkl, 'wb') as fh:
pickle.dump( (features, exported_genes), fh)
shell('touch {output.flag}')
# Combo-demultiplexing
rule COMBO_DEMULTIPLEXING:
rule COUNT_MATRIX:
input:
# flag2 = '_done_annotation',
flag1 = '_done_setupdir',
csv = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.csv'),
expid=list(set(sample_list))),
hdf = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.h5'),
expid=list(set(sample_list))),
output:
'_done_combodemultiplex', # dynamic() is not allowed if task being called on terminal
message: 'Performing combo-demultiplexing'
threads: len(item_names)
touch('_done_UMI')
message: 'Finished counting UMI-count matrix.'
run:
# Demultiplx fastq in Process pool
p = Pool(threads)
for itemid, itembc, itemr1, itemr2 in zip(item_names, bc_used, R1, R2):
itemid_in = join_path(DIR_PROJ, SUBDIR_INPUT, itemid)
try:
os.symlink(itemr1, join_path(itemid_in, 'R1.fastq.gz'))
os.symlink(itemr2, join_path(itemid_in, 'R2.fastq.gz'))
except OSError:
pass
itemid_fqs_dir = join_path(DIR_PROJ, SUBDIR_FASTQ, itemid)
itemid_log = join_path(DIR_PROJ, SUBDIR_DIAG, itemid,
'demultiplexing.log')
print_logger('Demultiplexing {}'.format(itemid))
cmd = " ".join(["bc_demultiplex",
itemr1,
itemr2,
"--bc-index {}".format(BC_INDEX_FPATH),
"--bc-seq-column {}".format(BC_SEQ_COLUMN),
"--bc-index-used {}".format(itembc),
"--min-bc-quality {}".format(FASTQ_QUAL_MIN_OF_BC),
"--umi-start-position {}".format(
UMI_START_POSITION),
"--bc-start-position {}".format(BC_START_POSITION),
"--umi-length {}".format(UMI_LENGTH),
"--bc-length {}".format(BC_LENGTH),
"--cut-length {}".format(CUT_LENGTH),
"--out-dir {}".format(itemid_fqs_dir),
"--is-gzip ",
"--stats-file {}".format(itemid_log)])
p.apply_async(shell, args=(cmd,))
p.close()
p.join()
shell('touch {output} ')
if ALIGNER == 'star':
shell('rm {}'.format('_done_star_genome_loaded'))
print('Free memory loaded by STAR', flush=True)
with tempfile.TemporaryDirectory() as tmpdirname:
cmd = 'STAR '
cmd += '--genomeLoad Remove '
cmd += '--genomeDir {STAR_INDEX_DIR} '
cmd += '--outFileNamePrefix '
cmd += join_path(tmpdirname, '')
shell(cmd)
print_logger('UMI-count matrix is saved at {}'.format(input.csv))
rule REPORT_ALIGNMENT_LOG:
input:
report = expand(join_path(DIR_PROJ, SUBDIR_REPORT, '{itemid}',
'alignment_'+ALIGNER+'.csv'),
itemid=item_names),
output:
touch('_done_report')
# Combo-demultiplexing
rule combo_demultiplexing:
input:
input: SAMPLE_TABLE_FPATH,
# flag2 = '_done_annotation',
flag1 = '_done_setupdir',
# flag1 = '_done_setupdir',
output:
fq = dynamic(join_path(DIR_PROJ, SUBDIR_FASTQ, '{itemid}', '{bc}.fastq')),
message: 'Performing combo-demultiplexing'
......@@ -299,14 +282,17 @@ rule combo_demultiplexing:
p = Pool(threads)
for itemid, itembc, itemr1, itemr2 in zip(item_names, bc_used, R1, R2):
itemid_in = join_path(DIR_PROJ, SUBDIR_INPUT, itemid)
mkfolder(itemid_in)
try:
os.symlink(itemr1, join_path(itemid_in, 'R1.fastq.gz'))
os.symlink(itemr2, join_path(itemid_in, 'R2.fastq.gz'))
except OSError:
pass
itemid_fqs_dir = join_path(DIR_PROJ, SUBDIR_FASTQ, itemid)
itemid_log = join_path(DIR_PROJ, SUBDIR_DIAG, itemid,
'demultiplexing.log')
mkfolder(join_path(DIR_PROJ, SUBDIR_REPORT, itemid))
itemid_log = join_path(DIR_PROJ, SUBDIR_REPORT, itemid,
'demultiplexing.csv')
print_logger('Demultiplexing {}'.format(itemid))
cmd = " ".join(["bc_demultiplex",
itemr1,
......@@ -429,9 +415,10 @@ if ALIGNER == 'star':
rule count_umi:
input:
gff = join_path(DIR_PROJ, SUBDIR_ANNO,
base_name(GFF) + '.pickle'),
flag = '_done_annotation',
# gff = join_path(DIR_PROJ, SUBDIR_ANNO,
# base_name(GFF) + '.pickle'),
# flag = '_done_annotation',
gff = rules.COOK_ANNOTATION.output.anno_pkl,
sam = join_path(DIR_PROJ, SUBDIR_ALIGN, '{itemid}', '{bc}.sam'),
output:
umicnt = join_path(DIR_PROJ, SUBDIR_UMI_CNT, '{itemid}', '{bc}.pkl'),
......@@ -454,8 +441,9 @@ rule count_umi:
# - regular umi-count using *_umicnt.pkl -> umi_count_matrix replicates/lanes per plate
rule summarize_umi_matrix_per_item:
input:
gff = join_path(DIR_PROJ, SUBDIR_ANNO,
base_name(GFF) + '.pickle'),
# gff = join_path(DIR_PROJ, SUBDIR_ANNO,
# base_name(GFF) + '.pickle'),
gff = rules.COOK_ANNOTATION.output.anno_pkl,
umicnt = dynamic(join_path(DIR_PROJ, SUBDIR_UMI_CNT,
'{itemid}', '{bc}.pkl')),
output:
......@@ -497,8 +485,9 @@ rule summarize_umi_matrix_per_item:
# - merge umi-count using *_umiset.pkl -> correct umi count per experiment/plate
rule summarize_umi_matrix_per_experiment:
input:
gff = join_path(DIR_PROJ, SUBDIR_ANNO,
base_name(GFF) + '.pickle'),
# gff = join_path(DIR_PROJ, SUBDIR_ANNO,
# base_name(GFF) + '.pickle'),
gff = rules.COOK_ANNOTATION.output.anno_pkl,
umiset = dynamic(join_path(DIR_PROJ, SUBDIR_UMI_SET,
'{itemid}', '{bc}.pkl')),
output:
......@@ -548,12 +537,6 @@ rule summarize_umi_matrix_per_experiment:
shell('touch {output.flag} ')
rule REPORT_ALIGNMENT_LOG:
input:
report = expand(join_path(DIR_PROJ, SUBDIR_REPORT, '{itemid}',
'alignment_'+ALIGNER+'.csv'),
itemid=item_names),
rule report_alignment_log:
input:
df = dynamic(join_path(DIR_PROJ, SUBDIR_LOG, '{itemid}', ALIGNER,
......@@ -561,7 +544,6 @@ rule report_alignment_log:
output:
report = expand(join_path(DIR_PROJ, SUBDIR_REPORT, '{itemid}',
'alignment_'+ALIGNER+'.csv'), itemid=item_names),
flag = '_done_report',
run:
for item in item_names:
......@@ -582,7 +564,6 @@ rule report_alignment_log:
report_names=logs_name_item,
aligner_name=ALIGNER,
savetocsv=report_item)
shell('touch {output.flag} ')
# rule summarize_alignment_diagnose:
# input:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment