Unverified Commit 51c02b7d authored by Yun YAN's avatar Yun YAN Committed by GitHub
Browse files

Merge pull request #16 from Puriney/master

0.4.8
parents ec5d06b2 9e4ced0c
......@@ -77,6 +77,7 @@ def demultiplexing(read1_fpath, read2_fpath, dict_bc_id2seq,
len_umi=6, len_bc=6, len_tx=35,
bc_qual_min=10,
is_gzip=True,
save_unknown_bc_fastq=False,
do_bc_rev_complement=False,
do_tx_rev_complement=False,
verbose=False):
......@@ -153,12 +154,13 @@ def demultiplexing(read1_fpath, read2_fpath, dict_bc_id2seq,
try:
fhout = bc_fhout[cell_bc]
except KeyError:
fhout = bc_fhout['UNKNOWNBC_R1']
fhout.write('{}\n{}\n{}\n{}\n'.format(umibc_name, umibc_seq,
"+", umibc_qualstr))
fhout = bc_fhout['UNKNOWNBC_R2']
fhout.write('{}\n{}\n{}\n{}\n'.format(tx_name, tx_seq,
"+", tx_qualstr))
if save_unknown_bc_fastq:
fhout = bc_fhout['UNKNOWNBC_R1']
fhout.write('{}\n{}\n{}\n{}\n'.format(umibc_name, umibc_seq,
"+", umibc_qualstr))
fhout = bc_fhout['UNKNOWNBC_R2']
fhout.write('{}\n{}\n{}\n{}\n'.format(tx_name, tx_seq,
"+", tx_qualstr))
sample_counter['unknown'] += 1
continue
......@@ -184,20 +186,20 @@ def demultiplexing(read1_fpath, read2_fpath, dict_bc_id2seq,
def write_demultiplexing(stats, dict_bc_id2seq, stats_fpath):
if stats_fpath is None:
stats_fpath = 'demultiplexing.log'
stats_fpath = 'demultiplexing.csv'
try:
fh_stats = open(stats_fpath, 'w')
except Exception as e:
raise Exception(e)
fh_stats.write('BC\tReads(#)\tReads(%)\n')
fh_stats.write('BC,Reads(#),Reads(%)\n')
for bc_id, bc_seq in dict_bc_id2seq.items():
# bc_id = '[{:04d}]'.format('-'.join(map(str, bc_id)))
formatter = '{:04d}-{}\t{}\t{:07.3f}\n'
formatter = '{:04d}-{},{},{:07.3f}\n'
fh_stats.write(formatter.format(bc_id, bc_seq, stats[bc_seq],
stats[bc_seq] / stats['total'] * 100))
formatter = '{}\t{}\t{:07.3f}\n'
formatter = '{},{},{:07.3f}\n'
fh_stats.write(formatter.format('saved', stats['saved'],
stats['saved'] / stats['total'] * 100))
fh_stats.write(formatter.format('unknown', stats['unknown'],
......@@ -247,6 +249,9 @@ def main():
help='Length of CELSeq barcode (default=6)')
parser.add_argument('--cut-length', metavar='N', type=int, default=35,
help='Length of read on R2 to be mapped. (default=35)')
parser.add_argument('--save-unknown-bc-fastq',
dest='save_unknown_bc_fastq', action='store_true')
parser.set_defaults(save_unknown_bc_fastq=False)
parser.add_argument('--verbose', dest='verbose', action='store_true')
parser.set_defaults(verbose=False)
......@@ -269,6 +274,7 @@ def main():
len_tx=args.cut_length,
bc_qual_min=args.min_bc_quality,
is_gzip=args.is_gzip,
save_unknown_bc_fastq=args.save_unknown_bc_fastq,
do_bc_rev_complement=False,
do_tx_rev_complement=False,
verbose=args.verbose)
......
......@@ -75,7 +75,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g0', gene_name='celseq_gene-0', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g0', gene_name='celseq2_gene-0', gene_biotype='protein_coding',
transcript_id='tx0')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -88,7 +88,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g0', gene_name='celseq_gene-0', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g0', gene_name='celseq2_gene-0', gene_biotype='protein_coding',
transcript_id='tx0', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
......@@ -116,7 +116,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g8', gene_name='celseq_gene-8',
tx_attr = gtf_attr_str(gene_id='g8', gene_name='celseq2_gene-8',
transcript_id='tx8', gene_biotype='lincRNA')
tx = gtf_str(chrm='chr2', src=src,
feature='transcript',
......@@ -129,7 +129,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g8', gene_name='celseq_gene-8', gene_biotype='lincRNA',
exon_attr = gtf_attr_str(gene_id='g8', gene_name='celseq2_gene-8', gene_biotype='lincRNA',
transcript_id='tx8', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr2', src=src,
......@@ -157,7 +157,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g1', gene_name='celseq_gene-1', gene_biotype='miRNA',
tx_attr = gtf_attr_str(gene_id='g1', gene_name='celseq2_gene-1', gene_biotype='miRNA',
transcript_id='tx1')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -170,7 +170,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(3, 0, -1):
exon_attr = gtf_attr_str(gene_id='g1', gene_name='celseq_gene-1', gene_biotype='miRNA',
exon_attr = gtf_attr_str(gene_id='g1', gene_name='celseq2_gene-1', gene_biotype='miRNA',
transcript_id='tx1', exon_num=i)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
......@@ -198,7 +198,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2', gene_biotype='protein_coding',
transcript_id='tx2.1')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -211,7 +211,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2, 0, -1):
exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2', gene_biotype='protein_coding',
transcript_id='tx2', exon_num=i)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
......@@ -226,7 +226,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
out.append(exon)
s_stream = e_stream + 1 + len_intron
tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2', gene_biotype='protein_coding',
transcript_id='tx2.2')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -238,7 +238,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=tx_attr)
fout.write('{}\n'.format(tx))
out.append(tx)
exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2', gene_biotype='protein_coding',
transcript_id='tx2.2', exon_num=1)
exon = gtf_str(chrm='chr1', src=src,
feature='exon',
......@@ -264,7 +264,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g3', gene_name='celseq_gene-3', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g3', gene_name='celseq2_gene-3', gene_biotype='protein_coding',
transcript_id='tx3')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -277,7 +277,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g3', gene_name='celseq_gene-3', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g3', gene_name='celseq2_gene-3', gene_biotype='protein_coding',
transcript_id='tx3', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
......@@ -306,7 +306,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g4', gene_name='celseq_gene-4', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g4', gene_name='celseq2_gene-4', gene_biotype='protein_coding',
transcript_id='tx4')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -319,7 +319,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g4', gene_name='celseq_gene-4', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g4', gene_name='celseq2_gene-4', gene_biotype='protein_coding',
transcript_id='tx4', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
......@@ -347,7 +347,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g5', gene_name='celseq_gene-5', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g5', gene_name='celseq2_gene-5', gene_biotype='protein_coding',
transcript_id='tx5')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -360,7 +360,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(1, 0, -1):
exon_attr = gtf_attr_str(gene_id='g5', gene_name='celseq_gene-5', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g5', gene_name='celseq2_gene-5', gene_biotype='protein_coding',
transcript_id='tx5', exon_num=i)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
......@@ -388,7 +388,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g6', gene_name='celseq_gene-6', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g6', gene_name='celseq2_gene-6', gene_biotype='protein_coding',
transcript_id='tx6')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -400,7 +400,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=tx_attr)
fout.write('{}\n'.format(tx))
out.append(tx)
exon_attr = gtf_attr_str(gene_id='g6', gene_name='celseq_gene-6', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g6', gene_name='celseq2_gene-6', gene_biotype='protein_coding',
transcript_id='tx6', exon_num=1)
exon = gtf_str(chrm='chr1', src=src,
feature='exon',
......@@ -424,7 +424,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g7', gene_name='celseq_gene-7', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g7', gene_name='celseq2_gene-7', gene_biotype='protein_coding',
transcript_id='tx7')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
......@@ -437,7 +437,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g7', gene_name='celseq_gene-7', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g7', gene_name='celseq2_gene-7', gene_biotype='protein_coding',
transcript_id='tx7', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
......
......@@ -35,7 +35,7 @@ def ymdhms():
def print_logger(msg):
localtime = time.asctime(time.localtime(time.time()))
# sys.stderr.write("[ {} ] {}\n".format(localtime, msg))
print("[ {} ] {}\n".format(localtime, msg))
print("[ {} ] {}\n".format(localtime, msg), flush=True)
def mkfolder(dirpath):
......
......@@ -5,6 +5,7 @@ import argparse
import pickle
from celseq2.helper import print_logger
# from genometools.ensembl.annotations import get_genes
def cook_anno_model(gff_fpath, feature_atrr='gene_id', feature_type='exon',
......@@ -50,8 +51,13 @@ def cook_anno_model(gff_fpath, feature_atrr='gene_id', feature_type='exon',
print_logger('Processed {:,} lines of GFF...'.format(i))
# Use genometools to select exported_genes
# if gene_types:
# exported_genes = get_genes(gff_fpath, valid_biotypes=set(gene_types))
# exported_genes = list(exported_genes['name'].values)
if exported_genes:
exported_genes = sorted(exported_genes)
exported_genes = tuple(sorted(exported_genes))
if dumpto:
with open(dumpto, 'wb') as fh:
pickle.dump((features, exported_genes), fh)
......
## CEL-seq2 Tech Setting ##
####################################
## Tech Setting
####################################
BC_INDEX_FPATH: '/absolute/path/to/wonderful_umi.tab'
BC_SEQ_COLUMN: 1
BC_SEQ_COLUMN: 0
BC_IDs_DEFAULT: '1-96'
UMI_START_POSITION: 0
UMI_LENGTH: 6
BC_START_POSITION: 6
BC_LENGTH: 6
## Tools ##
# 'bowtie2' 'star'
####################################
## Alignment Tools
####################################
## Which RNA-seq aligner to use? 'bowtie2' or 'star'
ALIGNER: 'bowtie2'
BOWTIE2_INDEX_PREFIX: '/absolute/path/to/bowtie2_index'
## What is the absolute path to command bowtie2?
BOWTIE2: '/absolute/path/to/bowtie2'
STAR_INDEX_DIR: '/absolute/path/to/star/folder'
## What is the sharef prefix of bowtie2 index?
BOWTIE2_INDEX_PREFIX: '/absolute/path/to/bowtie2_index'
## What is the absolute path to command STAR?
STAR: '/absolute/path/to/star'
## Whare is the directory to save STAR index?
STAR_INDEX_DIR: '/absolute/path/to/star/folder/'
####################################
## Annotations ##
# Path to GTF/GFF file
####################################
## Where is the GTF.gz/GFF.gz/GTF/GFF file?
GFF: '/absolute/path/to/wonderful.gtf.gz'
# Refer: http://htseq.readthedocs.io/en/master/count.html
## Refer: http://htseq.readthedocs.io/en/master/count.html
## What is considered the "gene"? 'gene_name' or 'gene_id'?
FEATURE_ID: 'gene_name'
## What is the content of the "gene"? Mostly 'exon'.
FEATURE_CONTENT: 'exon'
# GENE_BIOTYPE: ['protein_coding', 'lincRNA']
## Which type of genes to report?
GENE_BIOTYPE:
- 'protein_coding'
- 'lincRNA'
## If nothing set as below, all genes are reported.
## GENE_BIOTYPE:
## Demultiplexing ##
####################################
## Demultiplexing
####################################
FASTQ_QUAL_MIN_OF_BC: 10
CUT_LENGTH: 35
SAVE_UNKNOWN_BC_FASTQ: false
## Alignment ##
## UMI Count ##
####################################
## UMI Count
####################################
ALN_QUAL_MIN: 0
## Running Parameters ##
####################################
## Running Parameters
####################################
num_threads: 15
verbose: True
__version__ = '0.4.7'
__version__ = '0.4.8'
......@@ -18,6 +18,7 @@ import tempfile
import shutil
import os
## Inforamtion ##
# '/ifs/home/yy1533/Lab/cel-seq-pipe/demo/celseq2'
DIR_PROJ = config.get('output_dir', None)
......@@ -53,6 +54,7 @@ if not GENE_BIOTYPE: GENE_BIOTYPE = ();
## Demultiplexing ##
FASTQ_QUAL_MIN_OF_BC = config.get('FASTQ_QUAL_MIN_OF_BC', None) # 10
CUT_LENGTH = config.get('CUT_LENGTH', None) # 35
SAVE_UNKNOWN_BC_FASTQ = config.get('SAVE_UNKNOWN_BC_FASTQ', False) # False
## Alignment ##
ALIGNER = config.get('ALIGNER', None) # 'bowtie2', 'star'
assert (ALIGNER), 'Error: Specify aligner.'
......@@ -65,8 +67,8 @@ STRANDED = config.get('stranded', 'yes')
## Running Parameters ##
# bc_ids_used=config.get('bc_ids_used', None) # '1,2,3,4-5'
num_threads = config.get('num_threads', None) # 5
verbose = config.get('verbose', None) # True
num_threads = config.get('num_threads', 16) # 5
verbose = config.get('verbose', True) # True
#######################
......@@ -107,10 +109,10 @@ SUBDIRS = [SUBDIR_INPUT,
SUBDIR_LOG, SUBDIR_QSUB, SUBDIR_DIAG, SUBDIR_ANNO
]
aln_diagnose_item = ["_unmapped",
"_low_map_qual", '_multimapped', "_uniquemapped",
"_no_feature", "_ambiguous",
"_total"]
# aln_diagnose_item = ["_unmapped",
# "_low_map_qual", '_multimapped', "_uniquemapped",
# "_no_feature", "_ambiguous",
# "_total"]
## Dev ##
#####################
......@@ -121,39 +123,16 @@ workdir: DIR_PROJ
rule all:
input:
'_done_annotation',
'_done_UMI',
'_done_report',
'_done_annotation',
'_done_umimatrix_per_experiment',
output:
touch('_DONE'),
run:
if glob.glob('celseq2_job*.sh*'):
mkfolder(SUBDIR_QSUB)
shell('mv -f celseq2_job*.sh* {}'.format(SUBDIR_QSUB))
rule COUNT_MATRIX:
input:
csv = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.csv'),
expid=list(set(sample_list))),
hdf = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.h5'),
expid=list(set(sample_list))),
output:
touch('_done_UMI')
message: 'Finished counting UMI-count matrix.'
run:
if ALIGNER == 'star':
shell('rm {}'.format('_done_star_genome_loaded'))
print('Free memory loaded by STAR', flush=True)
with tempfile.TemporaryDirectory() as tmpdirname:
cmd = 'STAR '
cmd += '--genomeLoad Remove '
cmd += '--genomeDir {STAR_INDEX_DIR} '
cmd += '--outFileNamePrefix '
cmd += join_path(tmpdirname, '')
shell(cmd)
print_logger('UMI-count matrix is saved at {}'.format(input.csv))
'''
rule celseq2:
message: 'Finished Entire Pipeline.'
......@@ -201,112 +180,87 @@ rule celseq2:
print_logger('Expression UMI matrix is saved at {}'.format(input.csv))
'''
rule setup_dir:
input: SAMPLE_TABLE_FPATH
output:
touch('_done_setupdir'),
dir1 = SUBDIRS,
dir2 = expand(join_path('{subdir}', '{itemid}'),
subdir=[SUBDIR_INPUT,
SUBDIR_FASTQ, SUBDIR_ALIGN, SUBDIR_DIAG,
SUBDIR_UMI_CNT, SUBDIR_UMI_SET, SUBDIR_LOG],
itemid=item_names),
dir3 = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', '{itemid}'),
zip, expid=sample_list, itemid=item_names),
message: 'Setting up project directory.'
run:
for d in output.dir1:
mkfolder(d)
for d in output.dir2:
mkfolder(d)
for d in output.dir3:
mkfolder(d)
# rule setup_dir:
# input: SAMPLE_TABLE_FPATH
# output:
# touch('_done_setupdir'),
# dir1 = SUBDIRS,
# dir2 = expand(join_path('{subdir}', '{itemid}'),
# subdir=[SUBDIR_INPUT,
# SUBDIR_FASTQ, SUBDIR_ALIGN, SUBDIR_DIAG,
# SUBDIR_UMI_CNT, SUBDIR_UMI_SET, SUBDIR_LOG],
# itemid=item_names),
# dir3 = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', '{itemid}'),
# zip, expid=sample_list, itemid=item_names),
# message: 'Setting up project directory.'
# run:
# for d in output.dir1:
# mkfolder(d)
# for d in output.dir2:
# mkfolder(d)
# for d in output.dir3:
# mkfolder(d)
## HT-seq Count UMI ##
rule COOK_ANNOTATION:
rule COUNT_MATRIX:
input:
'_done_setupdir',
csv = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.csv'),
expid=list(set(sample_list))),
hdf = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.h5'),
expid=list(set(sample_list))),
output:
anno = join_path(DIR_PROJ, SUBDIR_ANNO,
base_name(GFF) + '.pickle'),
flag = '_done_annotation',
message: 'Cooking Annotation'
touch('_done_UMI')
message: 'Finished counting UMI-count matrix.'
run:
_ = cook_anno_model(GFF, feature_atrr=FEATURE_ID,
feature_type=FEATURE_CONTENT,
gene_types=GENE_BIOTYPE,
stranded=True,
dumpto=output.anno,
verbose=verbose)
shell('touch {output.flag}')
if ALIGNER == 'star':
shell('rm {}'.format('_done_star_genome_loaded'))
print('Free memory loaded by STAR', flush=True)
with tempfile.TemporaryDirectory() as tmpdirname:
cmd = 'STAR '
cmd += '--genomeLoad Remove '
cmd += '--genomeDir {STAR_INDEX_DIR} '
cmd += '--outFileNamePrefix '
cmd += join_path(tmpdirname, '')
shell(cmd)
print_logger('UMI-count matrix is saved at {}'.format(input.csv))
# Combo-demultiplexing
rule COMBO_DEMULTIPLEXING:
rule REPORT_ALIGNMENT_LOG:
input:
flag2 = '_done_annotation',
flag1 = '_done_setupdir',
report = expand(join_path(DIR_PROJ, SUBDIR_REPORT, '{itemid}',
'alignment_'+ALIGNER+'.csv'),
itemid=item_names),
output:
'_done_combodemultiplex', # dynamic() is not allowed if task being called on terminal
message: 'Performing combo-demultiplexing'
threads: len(item_names)
run:
# Demultiplx fastq in Process pool
p = Pool(threads)
for itemid, itembc, itemr1, itemr2 in zip(item_names, bc_used, R1, R2):
itemid_in = join_path(DIR_PROJ, SUBDIR_INPUT, itemid)
try:
os.symlink(itemr1, join_path(itemid_in, 'R1.fastq.gz'))
os.symlink(itemr2, join_path(itemid_in, 'R2.fastq.gz'))
except OSError:
pass
itemid_fqs_dir = join_path(DIR_PROJ, SUBDIR_FASTQ, itemid)
itemid_log = join_path(DIR_PROJ, SUBDIR_DIAG, itemid,
'demultiplexing.log')
print_logger('Demultiplexing {}'.format(itemid))
cmd = " ".join(["bc_demultiplex",
itemr1,
itemr2,
"--bc-index {}".format(BC_INDEX_FPATH),
"--bc-seq-column {}".format(BC_SEQ_COLUMN),
"--bc-index-used {}".format(itembc),
"--min-bc-quality {}".format(FASTQ_QUAL_MIN_OF_BC),
"--umi-start-position {}".format(
UMI_START_POSITION),
"--bc-start-position {}".format(BC_START_POSITION),
"--umi-length {}".format(UMI_LENGTH),
"--bc-length {}".format(BC_LENGTH),
"--cut-length {}".format(CUT_LENGTH),
"--out-dir {}".format(itemid_fqs_dir),
"--is-gzip ",
"--stats-file {}".format(itemid_log)])
p.apply_async(shell, args=(cmd,))
p.close()
p.join()
shell('touch {output} ')
touch('_done_report')
# Combo-demultiplexing
rule combo_demultiplexing:
input:
flag2 = '_done_annotation',
flag1 = '_done_setupdir',
input: SAMPLE_TABLE_FPATH,
# flag2 = '_done_annotation',
# flag1 = '_done_setupdir',
output:
fq = dynamic(join_path(DIR_PROJ, SUBDIR_FASTQ, '{itemid}', '{bc}.fastq')),
message: 'Performing combo-demultiplexing'
threads: len(item_names)
params:
jobs = len(item_names),
save_unknown_bc_fastq = SAVE_UNKNOWN_BC_FASTQ,
run:
# Demultiplx fastq in Process pool
p = Pool(threads)
p = Pool(params.jobs)
for itemid, itembc, itemr1, itemr2 in zip(item_names, bc_used, R1, R2):
itemid_in = join_path(DIR_PROJ, SUBDIR_INPUT, itemid)
mkfolder(itemid_in)
try:
os.symlink(itemr1, join_path(itemid_in, 'R1.fastq.gz'))
os.symlink(itemr2, join_path(itemid_in, 'R2.fastq.gz'))
except OSError:
pass
itemid_fqs_dir = join_path(DIR_PROJ, SUBDIR_FASTQ, itemid)
itemid_log = join_path(DIR_PROJ, SUBDIR_DIAG, itemid,
'demultiplexing.log')
mkfolder(join_path(DIR_PROJ, SUBDIR_REPORT, itemid))
itemid_log = join_path(DIR_PROJ, SUBDIR_REPORT, itemid,
'demultiplexing.csv')
print_logger('Demultiplexing {}'.format(itemid))
cmd = " ".join(["bc_demultiplex",
itemr1,
......@@ -324,6 +278,9 @@ rule combo_demultiplexing:
"--out-dir {}".format(itemid_fqs_dir),
"--is-gzip ",
"--stats-file {}".format(itemid_log)])
if params.save_unknown_bc_fastq:
cmd += ' --save-unknown-bc-fastq '
p.apply_async(shell, args=(cmd,))
p.close()