Unverified commit 86012144 authored by Yun YAN, committed by GitHub

Merge pull request #17 from Puriney/master

0.5.1
parents 51c02b7d 4a9d0080
@@ -16,9 +16,9 @@ Select top-used parameters from snakemake.snakemake():
* -p
* -r
* -T
* -w 1800 # 30min
* --keep-going
* --restart-times 1
* -w 300 # 5min
# * --keep-going (no longer enabled)
* --restart-times 2
Select optional parameters from snakemake.snakemake():
* --ri vs. --ii
@@ -27,6 +27,7 @@ Select optional parameters from snakemake.snakemake():
* -j
* --cluster
* -n
* --notemp (--nt)
Refs:
- https://snakemake.readthedocs.io/en/stable/api_reference/snakemake.html
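For orientation, a minimal sketch (not the pipeline's exact call) of how the defaults listed above map onto the snakemake.snakemake() API in the Snakemake version this pipeline targets; the workflow path is a placeholder and only the options named in this docstring are shown:

import sys
import snakemake

success = snakemake.snakemake(
    'Snakefile',           # placeholder workflow file path
    printshellcmds=True,   # -p
    printreason=True,      # -r
    timestamp=True,        # -T
    latency_wait=300,      # -w 300, i.e. 5 min
    keepgoing=False,       # --keep-going is no longer enabled
    restart_times=2,       # --restart-times 2
    notemp=False,          # set True to mimic --notemp / --nt
)
sys.exit(0 if success else 1)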
@@ -64,6 +65,11 @@ def get_argument_parser():
action="store_true", default=False,
help="Read has to be mapped to the opposite strand as the feature")
parser.add_argument(
"--celseq2-to-st", "--st",
action="store_true", default=False,
help="Rotate the UMI-count matrix to a shape of spots by genes.")
parser.add_argument(
"--cores", "--jobs", "-j",
action="store",
@@ -98,7 +104,11 @@ def get_argument_parser():
"--ignore-incomplete", "--ii",
action="store_true",
help="Do not check for incomplete output files.")
parser.add_argument(
"--keep-temp", dest='keep_temp',
action="store_true",
help="Keep the intermediate files after run.")
parser.set_defaults(keep_temp=False)
parser.add_argument(
"--version", "-v",
action="version",
@@ -120,15 +130,17 @@ def main():
configfile=args.config_file,
config={'output_dir': args.output_dir,
'experiment_table': args.experiment_table,
'stranded': stranded},
'stranded': stranded,
'run_celseq2_to_st': args.celseq2_to_st,
'keep_intermediate': args.keep_temp},
printshellcmds=True,
printreason=True,
timestamp=True,
latency_wait=1800,
latency_wait=300,
jobname="celseq2_job.{rulename}.{jobid}.sh",
keepgoing=True,
restart_times=1,
keepgoing=False,
restart_times=2,
dryrun=args.dryrun,
lock=not args.nolock,
@@ -139,7 +151,8 @@ def main():
nodes=args.cores,
force_incomplete=args.rerun_incomplete,
ignore_incomplete=args.ignore_incomplete)
ignore_incomplete=args.ignore_incomplete,
notemp=args.keep_temp)
sys.exit(0 if success else 1)
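Assuming the package exposes a console script named celseq2 and that the long-option spellings match the dest names used in main() (--config-file, --experiment-table, and --output-dir are not shown in this hunk, so they are assumptions), an invocation exercising the new flags might look like:

celseq2 --config-file config.yaml --experiment-table table.txt \
    --output-dir results --celseq2-to-st --keep-temp -j 8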
......
@@ -77,7 +77,8 @@ def main_new_experiment_table():
def get_workflow_file_string(mode='multiple'):
if mode == 'multiple':
fname = 'celseq2.snakemake'
# fname = 'celseq2.snakemake'
fname = 'celseq2_beta.snakemake'
else:
fname = 'celseq2_single_lib.snakemake'
fstring = resource_string('celseq2', 'workflow/{}'.format(fname))
@@ -86,7 +87,8 @@ def get_workflow_file_string(mode='multiple'):
def get_workflow_file_fpath(mode='multiple'):
if mode == 'multiple':
fname = 'celseq2.snakemake'
# fname = 'celseq2.snakemake'
fname = 'celseq2_beta.snakemake'
else:
fname = 'celseq2_single_lib.snakemake'
fpath = resource_filename('celseq2', 'workflow/{}'.format(fname))
......
#!/usr/bin/env python3
import pysam
import argparse
from celseq2.helper import print_logger
from celseq2.helper import join_path
from celseq2.demultiplex import bc_dict_id2seq, str2int
def _cell_seq(name, length=6):
    # Extract the cell barcode from a read name,
    # e.g. BC-TCTGAG_UMI-CGTTAC => TCTGAG
    return name.split('_')[0][3:3 + length]
def demultiplex_sam(samfile, outdir, bc_length):
    # Write one SAM file per cell barcode observed in the input SAM/BAM.
if not samfile:
return
samobj = pysam.AlignmentFile(samfile, 'rb')
dict_samout = {}
for aln in samobj:
bc = _cell_seq(aln.query_name, length=bc_length)
fh = dict_samout.get(bc, None)
if not fh:
outsam = join_path(outdir, bc + '.sam')
fh = pysam.AlignmentFile(outsam, 'w', template=samobj)
dict_samout[bc] = fh
fh.write(aln)
for _, fh in dict_samout.items():
fh.close()
def demultiplex_sam_with_claim(samfile, outdir, bc_length, claimed_bc):
    # Like demultiplex_sam(), but only keep reads whose barcode is in
    # the claimed list; reads with unclaimed barcodes are skipped.
if not samfile:
return
if not claimed_bc:
return
samobj = pysam.AlignmentFile(samfile, 'rb')
dict_samout = {}
for bc in claimed_bc:
fh = pysam.AlignmentFile(
join_path(outdir, bc + '.sam'),
'w', template=samobj)
dict_samout[bc] = fh
for aln in samobj:
bc = _cell_seq(aln.query_name, length=bc_length)
fh = dict_samout.get(bc, None)
if not fh:
continue
fh.write(aln)
for _, fh in dict_samout.items():
fh.close()
def main():
parser = argparse.ArgumentParser(add_help=True)
parser.add_argument('--sbam', type=str, metavar='FILENAME',
help='File path to SAM/BAM file')
parser.add_argument('--savetodir', type=str, metavar='DIRNAME',
help='Directory path to save the demultiplexed SAMs.',
default='.')
parser.add_argument('--bc-length', type=int, metavar='N',
help='Length of cell barcode.', default=6)
parser.add_argument('--claim', action='store_true', dest='claim')
parser.set_defaults(claim=False)
parser.add_argument('--bc-index', type=str, metavar='FILENAME',
help='File path to barcode dictionary.')
parser.add_argument('--bc-seq-column', type=int, metavar='N',
default=0,
help=('Column of cell barcode dictionary file '
'which tells the actual sequences.'))
parser.add_argument('--bc-index-used', type=str, metavar='string',
default='1-96',
help='Index of used barcode IDs (default=1-96)')
args = parser.parse_args()
print_logger('Demultiplexing SAM/BAM starts {} ...'.format(args.sbam))
if args.claim:
all_bc_dict = bc_dict_id2seq(args.bc_index, args.bc_seq_column)
bc_index_used = str2int(args.bc_index_used)
bc_seq_used = [all_bc_dict.get(x, None) for x in bc_index_used]
demultiplex_sam_with_claim(
samfile=args.sbam,
outdir=args.savetodir,
bc_length=args.bc_length,
claimed_bc=bc_seq_used)
else:
demultiplex_sam(
samfile=args.sbam,
outdir=args.savetodir,
bc_length=args.bc_length)
print_logger('Demultiplexing SAM/BAM ends. See: {}'.format(args.savetodir))
if __name__ == "__main__":
main()
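As a usage sketch, the two entry points above can also be driven directly from Python; the module name and all file paths below are assumptions:

from celseq2.demultiplex_sam import (demultiplex_sam,
                                     demultiplex_sam_with_claim)
from celseq2.demultiplex import bc_dict_id2seq, str2int

# One output SAM per barcode observed in the input.
demultiplex_sam(samfile='aligned.bam', outdir='demux', bc_length=6)

# Only keep reads whose barcode belongs to claimed IDs 1-24.
all_bc = bc_dict_id2seq('barcodes.tab', 0)            # {id: sequence}
claimed = [all_bc.get(i) for i in str2int('1-24')]
demultiplex_sam_with_claim(samfile='aligned.bam', outdir='demux',
                           bc_length=6, claimed_bc=claimed)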
@@ -37,18 +37,23 @@ def celseq2stpipeline(celseq2_fpath, spatial_map, out,
genes = map(lambda x: x.replace(' ', '_'), expr_valid.index.values)
colnames = expr_valid.columns.values
fhout.write('{}\t{}\n'.format('', '\t'.join(genes))) # header
# fhout.write('{}\t{}\n'.format('', '\t'.join(genes))) # header
fhout.write('{}\t{}\t{}\n'.format('Row', 'Col', '\t'.join(genes))) # header
for colname in colnames:
tmp = colname.replace('.', '-')
spot_seq = tmp.split('-')[-1]
        tmp = colname.replace('.', '-')  # 'BC-1-ATGC' or 'ATGC'
        spot_seq = tmp.split('-')[-1]    # 'ATGC' in either case
spot_expr = expr_valid[colname].values
spot_xy = dict_spatial_seq2xy.get(spot_seq, None)
if not spot_xy:
continue
spot_xy = 'x'.join(map(str, spot_xy))
fhout.write('{}\t{}\n'.format(
spot_xy,
# spot_xy = 'x'.join(map(str, spot_xy))
# fhout.write('{}\t{}\n'.format(
# spot_xy,
# '\t'.join(map(str, spot_expr))))
spot_r, spot_c = spot_xy
fhout.write('{}\t{}\t{}\n'.format(
spot_r, spot_c,
'\t'.join(map(str, spot_expr))))
fhout.close()
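A toy rendering of the new layout (values invented): the old writer packed the coordinates into a single column such as '3x5', whereas the rewritten header and rows carry Row and Col as separate fields:

Row	Col	Gene_A	Gene_B
3	5	0	12
4	5	7	0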
......
## Information ##
PROJECT_NAME: 'wonderful_sc'
DIR_PROJ: '/absolute/path/to/saved/dir'
## Sample ##
R1: '/absolute/path/to/r1.fastq.gz'
R2: '/absolute/path/to/r2.fastq.gz'
## Cell Barcode IDs ##
bc_ids_used: '1,2,3,4-5'
## CEL-seq2 Tech Setting ##
BC_INDEX_FPATH: '/absolute/path/to/wonderful_umi.tab'
BC_IDs_DEFAULT: '1-96'
UMI_LENGTH: 6
BC_LENGTH: 6
## Tools ##
BOWTIE2_INDEX_PREFIX: '/absolute/path/to/bowtie2_index'
BOWTIE2: '/absolute/path/to/bowtie2'
## Annotations ##
GFF: '/absolute/path/to/wonderful.gtf.gz'
## Demultiplexing ##
FASTQ_QUAL_MIN_OF_BC: 10
CUT_LENGTH: 35
## Alignment ##
ALIGNER: 'bowtie2'
## UMI Count ##
ALN_QUAL_MIN: 0
## Running Parameters ##
num_threads: 5
verbose: True
......
__version__ = '0.4.8'
__version__ = '0.5.1'
@@ -70,6 +70,7 @@ STRANDED = config.get('stranded', 'yes')
num_threads = config.get('num_threads', 16) # 5
verbose = config.get('verbose', True) # True
RUN_CELSEQ2_TO_ST = config.get('run_celseq2_to_st', False)
#######################
## Pipeline reserved ##
@@ -91,15 +92,16 @@ if R1 is None or R2 is None:
exit(1)
SUBDIR_INPUT = 'input'
SUBDIR_FASTQ = 'small_fq'
SUBDIR_ALIGN = 'small_sam'
SUBDIR_UMI_CNT = 'small_umi_count'
SUBDIR_UMI_SET = 'small_umi_set'
SUBDIR_FASTQ = '.small_fq'
SUBDIR_ALIGN = '.small_sam'
SUBDIR_UMI_CNT = '.small_umi_count'
SUBDIR_UMI_SET = '.small_umi_set'
SUBDIR_EXPR = 'expr'
SUBDIR_LOG = 'small_log'
SUBDIR_ST = 'ST'
SUBDIR_LOG = '.small_log'
SUBDIR_QSUB = 'qsub_log'
SUBDIR_DIAG = 'small_diagnose'
SUBDIR_ANNO = 'annotation'
SUBDIR_DIAG = '.small_diagnose'
SUBDIR_ANNO = '.annotation'
SUBDIR_REPORT = 'report'
SUBDIRS = [SUBDIR_INPUT,
@@ -121,86 +123,31 @@ SUBDIRS = [SUBDIR_INPUT,
workdir: DIR_PROJ
rule all:
input:
'_done_annotation',
'_done_UMI',
'_done_report',
output:
touch('_DONE'),
run:
if glob.glob('celseq2_job*.sh*'):
mkfolder(SUBDIR_QSUB)
shell('mv -f celseq2_job*.sh* {}'.format(SUBDIR_QSUB))
'''
rule celseq2:
message: 'Finished Entire Pipeline.'
input:
# Annotation
anno = join_path(DIR_PROJ, SUBDIR_ANNO,
base_name(GFF) + '.pickle'),
# anno = rules.cook_annotation.output.anno,
# Expression Matrix per experiment/sample/plate
# csv = rules.summarize_umi_matrix_per_experiment.output.csv,
# hdf = rules.summarize_umi_matrix_per_experiment.output.hdf,
csv = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.csv'),
expid=list(set(sample_list))),
hdf = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.h5'),
expid=list(set(sample_list))),
# Expression Matrix per item/pair-of-reads/lane
# csv_item = rules.summarize_umi_matrix_per_item.output.csv_item,
# hdf_item = rules.summarize_umi_matrix_per_item.output.hdf_item,
csv_item = expand(join_path(DIR_PROJ, SUBDIR_EXPR,
'{expid}', '{itemid}', 'expr.csv'), zip,
expid=sample_list, itemid=item_names),
hdf_item = expand(join_path(DIR_PROJ, SUBDIR_EXPR,
'{expid}', '{itemid}', 'expr.h5'), zip,
expid=sample_list, itemid=item_names),
# Diagnose of demultiplexing
# demultiplexing = expand(join_path(DIR_PROJ, SUBDIR_DIAG,
# '{itemid}', 'demultiplexing.log'),
# itemid=item_names),
# # Diagnose of alignment
alignment = expand(join_path(DIR_PROJ, SUBDIR_DIAG,
'{itemid}', 'alignment_diagnose.csv'),
itemid=item_names),
output:
touch('_done_celseq2')
run:
if ALIGNER == 'star':
shell('rm {}'.format('_done_star_genome_loaded'))
print('Free memory loaded by STAR', flush=True)
cmd = 'STAR '
cmd += '--genomeLoad Remove '
cmd += '--genomeDir {STAR_INDEX_DIR} '
shell(cmd)
print_logger('Expression UMI matrix is saved at {}'.format(input.csv))
'''
# rule setup_dir:
# input: SAMPLE_TABLE_FPATH
# output:
# touch('_done_setupdir'),
# dir1 = SUBDIRS,
# dir2 = expand(join_path('{subdir}', '{itemid}'),
# subdir=[SUBDIR_INPUT,
# SUBDIR_FASTQ, SUBDIR_ALIGN, SUBDIR_DIAG,
# SUBDIR_UMI_CNT, SUBDIR_UMI_SET, SUBDIR_LOG],
# itemid=item_names),
# dir3 = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', '{itemid}'),
# zip, expid=sample_list, itemid=item_names),
# message: 'Setting up project directory.'
# run:
# for d in output.dir1:
# mkfolder(d)
# for d in output.dir2:
# mkfolder(d)
# for d in output.dir3:
# mkfolder(d)
if RUN_CELSEQ2_TO_ST:
rule all:
input:
# '_done_annotation',
'_done_UMI',
'_done_report',
'_done_ST',
output:
touch('_DONE'),
run:
if glob.glob('celseq2_job*.sh*'):
mkfolder(SUBDIR_QSUB)
shell('mv -f celseq2_job*.sh* {}'.format(SUBDIR_QSUB))
else:
rule all:
input:
# '_done_annotation',
'_done_UMI',
'_done_report',
output:
touch('_DONE'),
run:
if glob.glob('celseq2_job*.sh*'):
mkfolder(SUBDIR_QSUB)
shell('mv -f celseq2_job*.sh* {}'.format(SUBDIR_QSUB))
rule COUNT_MATRIX:
@@ -213,18 +160,58 @@ rule COUNT_MATRIX:
touch('_done_UMI')
message: 'Finished counting UMI-count matrix.'
run:
if ALIGNER == 'star':
shell('rm {}'.format('_done_star_genome_loaded'))
print('Free memory loaded by STAR', flush=True)
with tempfile.TemporaryDirectory() as tmpdirname:
cmd = 'STAR '
cmd += '--genomeLoad Remove '
cmd += '--genomeDir {STAR_INDEX_DIR} '
cmd += '--outFileNamePrefix '
cmd += join_path(tmpdirname, '')
shell(cmd)
# if ALIGNER == 'star':
# shell('rm {}'.format('_done_star_genome_loaded'))
# print('Free memory loaded by STAR', flush=True)
# with tempfile.TemporaryDirectory() as tmpdirname:
# cmd = 'STAR '
# cmd += '--genomeLoad Remove '
# cmd += '--genomeDir {STAR_INDEX_DIR} '
# cmd += '--outFileNamePrefix '
# cmd += join_path(tmpdirname, '')
# shell(cmd)
if glob.glob('celseq2_job*.sh*'):
mkfolder(SUBDIR_QSUB)
shell('mv -f celseq2_job*.sh* {}'.format(SUBDIR_QSUB))
print_logger('UMI-count matrix is saved at {}'.format(input.csv))
if RUN_CELSEQ2_TO_ST:
rule CELSEQ2_TO_ST:
input:
tsv = expand(join_path(DIR_PROJ, SUBDIR_ST, '{expid}', 'ST.tsv'),
expid=list(set(sample_list))),
message: 'Convert to ST format.'
output:
touch('_done_ST')
run:
if glob.glob('celseq2_job*.sh*'):
mkfolder(SUBDIR_QSUB)
shell('mv -f celseq2_job*.sh* {}'.format(SUBDIR_QSUB))
rule _celseq2_to_st:
input:
hdf = join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.h5'),
flag = '_done_UMI',
output:
tsv = join_path(DIR_PROJ, SUBDIR_ST, '{expid}', 'ST.tsv'),
message: 'UMI-count matrix for ST: {output.tsv} '
params:
exclude_empty_spots = False,
exclude_nondetected_genes = False,
run:
cmd = 'celseq2-to-st {input.hdf} '
cmd += ' {} '.format(BC_INDEX_FPATH)
cmd += ' {output.tsv} '
if params.exclude_empty_spots:
cmd += ' --exclude-empty-spots '
if params.exclude_nondetected_genes:
cmd += ' --exclude-nondetected-genes '
shell(cmd)
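For reference, the command this rule composes, with placeholder paths and both optional flags switched on, would look roughly like:

celseq2-to-st expr/sample_1/expr.h5 /path/to/barcode_index.tab \
    ST/sample_1/ST.tsv --exclude-empty-spots --exclude-nondetected-genes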
rule REPORT_ALIGNMENT_LOG:
input:
report = expand(join_path(DIR_PROJ, SUBDIR_REPORT, '{itemid}',
@@ -326,21 +313,21 @@ if ALIGNER == 'bowtie2':
if ALIGNER == 'star':
rule star_load_genome:
output:
flag = '_done_star_genome_loaded',
message: 'Loading genome to memory for STAR'
shadow: "shallow"
run:
cmd = 'STAR '
cmd += '--genomeLoad LoadAndExit '
cmd += '--genomeDir {STAR_INDEX_DIR} '
cmd += '&& echo loaded >> {output.flag} '
shell(cmd)
# rule star_load_genome:
# output:
# flag = '_done_star_genome_loaded',
# message: 'Loading genome to memory for STAR'
# shadow: "shallow"
# run:
# cmd = 'STAR '
# cmd += '--genomeLoad LoadAndExit '
# cmd += '--genomeDir {STAR_INDEX_DIR} '
# cmd += '&& echo loaded >> {output.flag} '
# shell(cmd)
rule align_star:
input:
flag = rules.star_load_genome.output.flag, #'_done_star_genome_loaded',
# flag = rules.star_load_genome.output.flag, #'_done_star_genome_loaded',
fq = join_path(DIR_PROJ, SUBDIR_FASTQ, '{itemid}', '{bc}.fastq'),
output:
sam = join_path(DIR_PROJ, SUBDIR_ALIGN, '{itemid}', '{bc}.sam'),
@@ -357,7 +344,7 @@ if ALIGNER == 'star':
run:
cmd = 'STAR '
cmd += ' --runRNGseed 42 '
cmd += ' --genomeLoad LoadAndKeep '
cmd += ' --genomeLoad NoSharedMemory '
cmd += ' --runThreadN {params.threads} '
cmd += ' --genomeDir {STAR_INDEX_DIR} '
# cmd += ' --readFilesCommand zcat '
@@ -389,9 +376,11 @@ rule COOK_ANNOTATION:
input:
SAMPLE_TABLE_FPATH,
output:
anno_pkl = join_path(DIR_PROJ, SUBDIR_ANNO, base_name(GFF) + '.pickle'),
anno_csv = join_path(DIR_PROJ, SUBDIR_ANNO, base_name(GFF) + '.csv'),
flag = '_done_annotation',
anno_pkl = temp(join_path(DIR_PROJ, SUBDIR_ANNO,
base_name(GFF) + '.pickle')),
anno_csv = temp(join_path(DIR_PROJ, SUBDIR_ANNO,
base_name(GFF) + '.csv')),
# flag = '_done_annotation',
params:
gene_type = GENE_BIOTYPE
priority: 100
@@ -421,20 +410,16 @@ rule COOK_ANNOTATION:
with open(output.anno_pkl, 'wb') as fh:
pickle.dump( (features, exported_genes), fh)
shell('touch {output.flag}')
rule count_umi:
input:
# gff = join_path(DIR_PROJ, SUBDIR_ANNO,
# base_name(GFF) + '.pickle'),
# flag = '_done_annotation',
gff = rules.COOK_ANNOTATION.output.anno_pkl,
sam = join_path(DIR_PROJ, SUBDIR_ALIGN, '{itemid}', '{bc}.sam'),
output:
umicnt = join_path(DIR_PROJ, SUBDIR_UMI_CNT, '{itemid}', '{bc}.pkl'),
umiset = join_path(DIR_PROJ, SUBDIR_UMI_SET, '{itemid}', '{bc}.pkl'),
aln_diag = join_path(DIR_PROJ, SUBDIR_DIAG, '{itemid}', '{bc}.pkl'),
umicnt = temp(join_path(DIR_PROJ, SUBDIR_UMI_CNT,
'{itemid}', '{bc}.pkl')),
umiset = temp(join_path(DIR_PROJ, SUBDIR_UMI_SET,
'{itemid}', '{bc}.pkl')),
message: 'Counting {input.sam}'
run:
features_f, _ = pickle.load(open(input.gff, 'rb'))
@@ -446,14 +431,11 @@ rule count_umi:
dumpto=None)
pickle.dump(umi_cnt, open(output.umicnt, 'wb'))
pickle.dump(umi_set, open(output.umiset, 'wb'))
pickle.dump(aln_cnt, open(output.aln_diag, 'wb'))
# - regular umi-count using *_umicnt.pkl -> umi_count_matrix replicates/lanes per plate
rule summarize_umi_matrix_per_item:
input:
# gff = join_path(DIR_PROJ, SUBDIR_ANNO,
# base_name(GFF) + '.pickle'),
gff = rules.COOK_ANNOTATION.output.anno_pkl,
umicnt = dynamic(join_path(DIR_PROJ, SUBDIR_UMI_CNT,
'{itemid}', '{bc}.pkl')),
@@ -465,7 +447,6 @@ rule summarize_umi_matrix_per_item:
hdf_item = expand(join_path(DIR_PROJ, SUBDIR_EXPR,
'{expid}', '{itemid}', 'expr.h5'), zip,
expid=sample_list, itemid=item_names),
flag = '_done_umimatrix_per_item',
run:
_, export_genes = pickle.load(open(input.gff, 'rb'))
@@ -490,8 +471,6 @@ rule summarize_umi_matrix_per_item:
expr_df.to_hdf(join_path(DIR_PROJ, SUBDIR_EXPR,
exp_id, item_id, 'expr.h5'), 'table')
shell('touch {output.flag} ')
# - merge umi-count using *_umiset.pkl -> correct umi count per experiment/plate
rule summarize_umi_matrix_per_experiment:
@@ -507,7 +486,7 @@ rule summarize_umi_matrix_per_experiment:
expid=list(set(sample_list))),
hdf = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.h5'),
expid=list(set(sample_list))),
flag = '_done_umimatrix_per_experiment',
# flag = '_done_umimatrix_per_experiment',
run:
_, export_genes = pickle.load(open(input.gff, 'rb'))
@@ -545,7 +524,7 @@ rule summarize_umi_matrix_per_experiment:
expr_df.to_hdf(join_path(DIR_PROJ, SUBDIR_EXPR,
exp_id, 'expr.h5'), 'table')
shell('touch {output.flag} ')
# shell('touch {output.flag} ')
rule report_alignment_log:
@@ -576,33 +555,6 @@ rule report_alignment_log:
aligner_name=ALIGNER,
savetocsv=report_item)
# rule summarize_alignment_diagnose:
# input:
# aln_diag = dynamic(join_path(DIR_PROJ, SUBDIR_DIAG,
# '{itemid}', '{bc}.pkl')),
# output: