Commit e57d687e authored by Yun YAN's avatar Yun YAN Committed by GitHub
Browse files

Merge pull request #4 from Puriney/ST

Merge features supporting ST
parents 364c1657 db05ce4d
......@@ -27,29 +27,55 @@ def str2int(s):
return(sorted(list(set(out))))
def bc_dict_seq2id(bc_index_fpath, col_seq=None):
    """ dict[barcode_seq] = barcode_id

    Read a cell-barcode index file and number its non-comment rows.

    Parameters
    ----------
    bc_index_fpath : str
        Path to the barcode index file (whitespace-separated columns;
        lines starting with '#' are skipped as comments/headers).
    col_seq : int, optional
        Zero-based column holding the barcode sequence. Default: 0.

    Returns
    -------
    dict
        Maps 1-based row number -> barcode sequence.
        NOTE(review): despite the name/docstring suggesting seq -> id,
        the mapping built here is id -> seq, identical to
        ``bc_dict_id2seq`` — confirm against callers before relying on it.
    """
    if col_seq is None:
        col_seq = 0
    out = dict()
    row_num = 1  # barcode IDs are 1-based
    with open(bc_index_fpath, 'rt') as fin:
        for row in fin:
            if row.startswith('#'):  # skip comment/header lines
                continue
            fields = row.strip().split()
            out[row_num] = fields[col_seq]
            row_num += 1
    return(out)
def bc_dict_id2seq(bc_index_fpath, col_seq=None):
    """ dict[barcode_id] = barcode_seq

    Read a cell-barcode index file and number its non-comment rows.

    Parameters
    ----------
    bc_index_fpath : str
        Path to the barcode index file (whitespace-separated columns;
        lines starting with '#' are skipped as comments/headers).
    col_seq : int, optional
        Zero-based column holding the barcode sequence. Default: 0.

    Returns
    -------
    dict
        Maps 1-based row number (barcode ID) -> barcode sequence.
    """
    if col_seq is None:
        col_seq = 0
    out = dict()
    row_num = 1  # barcode IDs are 1-based
    with open(bc_index_fpath, 'rt') as fin:
        for row in fin:
            if row.startswith('#'):  # skip comment/header lines
                continue
            fields = row.strip().split()
            out[row_num] = fields[col_seq]
            row_num += 1
    return(out)
def demultiplexing(read1_fpath, read2_fpath, dict_bc_id2seq,
outdir,
len_umi=6, len_bc=6, len_tx=35, bc_qual_min=10,
start_umi=0, start_bc=6,
len_umi=6, len_bc=6, len_tx=35,
bc_qual_min=10,
is_gzip=True,
do_bc_rev_complement=False,
do_tx_rev_complement=False,
......@@ -70,6 +96,7 @@ def demultiplexing(read1_fpath, read2_fpath, dict_bc_id2seq,
bc_fhout = dict()
for bc_id, bc_seq in dict_bc_id2seq.items():
# bc_id = '[{}]'.format('-'.join(map(str, bc_id)))
bc_fhout[bc_seq] = join_path(outdir,
'BC-{}-{}.fastq'.format(bc_id, bc_seq))
mkfolder(join_path(outdir, 'UNKNOWN'))
......@@ -108,18 +135,21 @@ def demultiplexing(read1_fpath, read2_fpath, dict_bc_id2seq,
sample_counter['total'] += 1
if len(umibc_seq) < len_umi + len_bc:
umibc_idx = sorted(list(set(range(start_umi, start_umi + len_umi)) |
set(range(start_bc, start_bc + len_bc))))
if len(umibc_seq) < len(umibc_idx):
continue
umibc_min_qual = min(
(ord(c) - 33 for c in umibc_qualstr[:(len_umi + len_bc)]))
umibc_min_qual = min((ord(umibc_qualstr[i]) - 33 for i in umibc_idx))
if umibc_min_qual < bc_qual_min:
continue
sample_counter['qualified'] += 1
umi, cell_bc = umibc_seq[0:len_umi], umibc_seq[len_umi:(
len_umi + len_bc)]
umi = umibc_seq[start_umi:(start_umi + len_umi)]
cell_bc = umibc_seq[start_bc:(start_bc + len_bc)]
try:
fhout = bc_fhout[cell_bc]
except KeyError:
......@@ -160,12 +190,14 @@ def write_demultiplexing(stats, dict_bc_id2seq, stats_fpath):
except Exception as e:
raise Exception(e)
fh_stats.write('BC\tReads(#)\tReads(%)\n')
for bc_id, bc_seq in dict_bc_id2seq.items():
formatter = '{:03d}-{}\t{:,}\t{:06.2f}\n'
# bc_id = '[{:04d}]'.format('-'.join(map(str, bc_id)))
formatter = '{:04d}-{}\t{}\t{:07.3f}\n'
fh_stats.write(formatter.format(bc_id, bc_seq, stats[bc_seq],
stats[bc_seq] / stats['total'] * 100))
formatter = '{}\t{}\t{:06.2f}\n'
formatter = '{}\t{}\t{:07.3f}\n'
fh_stats.write(formatter.format('saved', stats['saved'],
stats['saved'] / stats['total'] * 100))
fh_stats.write(formatter.format('unknown', stats['unknown'],
......@@ -184,6 +216,10 @@ def main():
parser.add_argument('read2_fpath', type=str)
parser.add_argument('--bc-index', type=str, metavar='FILENAME',
help='File path to barcode dictionary')
parser.add_argument('--bc-seq-column', type=int, metavar='N',
default=0,
help=('Column of cell barcode dictionary file '
'which tells the actual sequences.'))
parser.add_argument('--bc-index-used', type=str, metavar='string',
default='1-96',
help='Index of used barcode IDs (default=1-96)')
......@@ -197,28 +233,37 @@ def main():
parser.add_argument('--stats-file', metavar='STATFILE',
type=str, default='demultiplexing.log',
help='Statistics (default: demultiplexing.log)')
parser.add_argument('--umi-length', metavar='N', type=int, default=0,
help='Length of UMI (default=0, e.g. no UMI)')
parser.add_argument('--bc-length', metavar='N', type=int, default=8,
help='Length of CELSeq barcode (default=8)')
parser.add_argument('--umi-start-position',
metavar='N', type=int, default=0,
help=('Start index of UMI on R1. '
'Default: 0. (0-based).'))
parser.add_argument('--umi-length', metavar='N', type=int, default=6,
help='Length of UMI (default=6)')
parser.add_argument('--bc-start-position',
metavar='N', type=int, default=6,
help=('Start index of cell barcode on R1. '
'Default: 6. (0-based).'))
parser.add_argument('--bc-length', metavar='N', type=int, default=6,
help='Length of CELSeq barcode (default=6)')
parser.add_argument('--cut-length', metavar='N', type=int, default=35,
help='Length of mapped read (default=35)')
help='Length of read on R2 to be mapped. (default=35)')
parser.add_argument('--verbose', dest='verbose', action='store_true')
parser.set_defaults(verbose=False)
args = parser.parse_args()
bc_dict = bc_dict_id2seq(args.bc_index)
bc_dict = bc_dict_id2seq(args.bc_index, args.bc_seq_column)
if args.bc_index_used != '1-96':
bc_index_used = str2int(args.bc_index_used)
bc_dict = {x: bc_dict.get(x, None) for x in bc_index_used}
bc_index_used = str2int(args.bc_index_used)
bc_dict = {x: bc_dict.get(x, None) for x in bc_index_used}
print_logger('Demultiplexing starts {}--{} ...'.format(args.read1_fpath,
args.read2_fpath))
out = demultiplexing(read1_fpath=args.read1_fpath,
read2_fpath=args.read2_fpath,
outdir=args.out_dir, dict_bc_id2seq=bc_dict,
start_umi=args.umi_start_position,
start_bc=args.bc_start_position,
len_umi=args.umi_length,
len_bc=args.bc_length,
len_tx=args.cut_length,
......
#!/usr/bin/env python3
import argparse
from .helper import print_logger
from .helper import filehandle_fastq_gz
from collections import Counter
def get_dict_bc_has_reads(r1, bc_index, bc_seq_col):
    """Count, per known cell barcode, how many R1 reads begin with it.

    Parameters
    ----------
    r1 : str
        Path to the R1 FASTQ file; gzip is detected by a '.gz' suffix.
    bc_index : str
        Path to the cell-barcode index file (whitespace-separated columns).
    bc_seq_col : int
        Zero-based column of *bc_index* holding the barcode sequence.

    Returns
    -------
    collections.Counter
        Maps barcode sequence -> read count; reads whose leading bases do
        not match any known barcode are tallied under the 'unknown' key.
    """
    print(r1)
    # Close the index file promptly instead of leaking it on error.
    with open(bc_index, 'rt') as fin:
        known_bc = set(row.strip().split()[bc_seq_col] for row in fin)
    print_logger('There are {} different cell barcodes.'.format(len(known_bc)))
    res = Counter({bc: 0 for bc in known_bc})
    # Assumes every barcode has the same length; an arbitrary one is
    # sampled to decide how many leading bases of R1 to compare.
    bc_len = len(next(iter(res)))
    fh_r1 = filehandle_fastq_gz(r1) if r1.endswith('.gz') else open(r1, 'r')
    i = 0
    try:
        while True:
            if i % 1000000 == 0:
                print_logger('Processing {:,} reads...'.format(i))
            # FASTQ records are 4 lines: @name, sequence, '+', quality.
            try:
                _ = next(fh_r1).rstrip()
                r1_seq = next(fh_r1).rstrip()
                _ = next(fh_r1).rstrip()
                _ = next(fh_r1).rstrip()
            except StopIteration:  # end of file
                break
            i += 1
            r1_bc = r1_seq[:bc_len]
            if not r1_bc:  # empty sequence line; nothing to classify
                continue
            if r1_bc in known_bc:
                res[r1_bc] += 1
            else:
                res['unknown'] += 1
        print_logger('Processed total {:,} reads...'.format(i))
    finally:
        # Guarantee the FASTQ handle is released even if counting fails.
        fh_r1.close()
    return res
def main():
    """CLI: count reads per cell barcode in R1 and write a stats table.

    Requires both --r1 and --bc-index to do any work; the output file
    lists per-barcode read counts and percentages plus max/min/avg and
    total summary rows.
    """
    parser = argparse.ArgumentParser(add_help=True)
    parser.add_argument(
        '--bc-index', metavar='FILENAME', type=str,
        help=('File path to cell barcode index.'))
    parser.add_argument(
        '--bc-seq-col', metavar='N', default=1, type=int,
        # BUG FIX: this help value was a tuple (stray comma instead of
        # implicit string concatenation), so argparse printed its repr.
        help=('Column index of cell barcode index file to find the sequence'
              ' of cell barcodes. Default: 1 (2nd column).'))
    parser.add_argument(
        '--r1', metavar='FILENAME', type=str,
        help=('File path to R1.'))
    parser.add_argument(
        '-o', '--output',
        metavar='FILENAME', type=str,
        required=True,
        help=('File path save output log.'))
    args = parser.parse_args()
    if args.r1 and args.bc_index:
        counter_bc_size = get_dict_bc_has_reads(args.r1,
                                                args.bc_index,
                                                args.bc_seq_col)
        # NOTE(review): tot == 0 (empty R1) would raise ZeroDivisionError
        # below, matching the previous behavior — confirm if a guard is
        # wanted.
        tot = sum(counter_bc_size.values())
        with open(args.output, 'w') as fhout:
            bc_size_max, bc_size_min = float('-inf'), float('inf')
            for bc in counter_bc_size:
                # 'unknown' is a tally bucket, not a real barcode; keep it
                # out of the max/min summary.
                if bc != 'unknown' and counter_bc_size[bc] > bc_size_max:
                    bc_size_max = counter_bc_size[bc]
                if bc != 'unknown' and counter_bc_size[bc] < bc_size_min:
                    bc_size_min = counter_bc_size[bc]
                fhout.write('{:>{}}\t{:,}\t{:06.2f}\n'.format(
                    bc, 20,
                    counter_bc_size[bc], counter_bc_size[bc] * 100 / tot))
            valid_bc_size_val = [counter_bc_size[x]
                                 for x in counter_bc_size if x != 'unknown']
            # Plain mean of the per-barcode counts (excluding 'unknown').
            bc_size_avg = sum(valid_bc_size_val) / len(valid_bc_size_val)
            formatter = '{:>{}}\t{:,}\t{:06.2f}\n'
            fhout.write(formatter.format(
                'bc_size_max', 20,
                bc_size_max, bc_size_max * 100 / tot))
            fhout.write(formatter.format(
                'bc_size_min', 20,
                bc_size_min, bc_size_min * 100 / tot))
            fhout.write('{:>{}}\t{:06.2f}\t{:06.2f}\n'.format(
                'bc_size_avg', 20,
                bc_size_avg, bc_size_avg * 100 / tot))
            fhout.write(formatter.format(
                'total', 20,
                tot, tot * 100 / tot))


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
'''
Support Spatial Transcriptome Pipeline output
https://github.com/SpatialTranscriptomicsResearch/st_pipeline
'''
import argparse
import pandas as pd
def celseq2stpipeline(celseq2_hdf5, spatial_map, out):
    """Convert a celseq2 expression HDF5 into an st_pipeline-readable TSV.

    Parameters
    ----------
    celseq2_hdf5 : str
        Path to expr.hdf5 produced by celseq2 (pandas HDF key 'table',
        genes x cells).
    spatial_map : str
        Path to the spatial dictionary: whitespace-separated rows of
        barcode-seq, x, y; '#'-prefixed lines are skipped.
    out : str
        Path of the TSV to write (one row per mapped spot).
    """
    dict_spatial_seq2xy = {}
    with open(spatial_map, 'r') as fin:
        for row in fin:
            if row.startswith('#'):  # skip comment/header lines
                continue
            row = row.strip().split()
            dict_spatial_seq2xy[row[0]] = (row[1], row[2])
    expr = pd.read_hdf(celseq2_hdf5, 'table')  # genes x cells
    # Drop all-zero genes (rows) and all-zero cells (columns).
    expr_valid = expr.loc[(expr.sum(axis=1) != 0), (expr.sum(axis=0) != 0)]
    genes = expr_valid.index.values
    colnames = expr_valid.columns.values
    # Open the output only once the inputs parsed cleanly, and close it
    # deterministically even if a write fails.
    with open(out, 'w') as fhout:
        fhout.write('{}\t{}\n'.format('', '\t'.join(genes)))  # header
        for colname in colnames:
            # Presumably column names end with the barcode sequence after
            # the last '-' (e.g. 'BC-<id>-<seq>') — TODO confirm upstream.
            spot_seq = colname.split('-')[-1]
            spot_expr = expr_valid[colname].values
            spot_xy = dict_spatial_seq2xy.get(spot_seq, None)
            if not spot_xy:
                # Barcode not present in the spatial map: skip the spot.
                continue
            spot_xy = 'x'.join(map(str, spot_xy))
            fhout.write('{}\t{}\n'.format(
                spot_xy,
                '\t'.join(map(str, spot_expr))))
    print(out)
def main():
    """CLI entry point: parse the three file paths and run the conversion."""
    arg_parser = argparse.ArgumentParser(add_help=True)
    # All three arguments are positional FILENAME strings; declare them
    # table-style to keep the help text next to each name.
    positional = (
        ('celseq2', ('File path to expr.hdf5 generated by celseq2.')),
        ('spatial_map', 'File path to spatial position dictionary.'),
        ('out', ('File path to save st_pipeline readable'
                 ' output in tsv.')),
    )
    for arg_name, arg_help in positional:
        arg_parser.add_argument(arg_name, metavar='FILENAME', type=str,
                                help=arg_help)
    opts = arg_parser.parse_args()
    celseq2stpipeline(opts.celseq2, opts.spatial_map, opts.out)


if __name__ == "__main__":
    main()
## CEL-seq2 Tech Setting ##
BC_INDEX_FPATH: '/absolute/path/to/wonderful_umi.tab'
BC_SEQ_COLUMN: 1
BC_IDs_DEFAULT: '1-96'
UMI_START_POSITION: 0
UMI_LENGTH: 6
BC_START_POSITION: 6
BC_LENGTH: 6
## Tools ##
......@@ -9,7 +12,11 @@ BOWTIE2_INDEX_PREFIX: '/absolute/path/to/bowtie2_index'
BOWTIE2: '/absolute/path/to/bowtie2'
## Annotations ##
# Path to GTF/GFF file
GFF: '/absolute/path/to/wonderful.gtf.gz'
# Refer: http://htseq.readthedocs.io/en/master/count.html
FEATURE_ID: 'gene_name'
FEATURE_CONTENT: 'exon'
## Demultiplexing ##
FASTQ_QUAL_MIN_OF_BC: 10
......
......@@ -27,6 +27,9 @@ SAMPLE_TABLE = cook_sample_sheet(SAMPLE_TABLE_FPATH) # ''
# '/ifs/data/yanailab/refs/barcodes/barcodes_cel-seq_umis96.tab'
BC_INDEX_FPATH = config.get('BC_INDEX_FPATH', None)
BC_IDs_DEFAULT = config.get('BC_IDs_DEFAULT', None) # '1-96'
BC_SEQ_COLUMN = config.get('BC_SEQ_COLUMN', None)
UMI_START_POSITION = config.get('UMI_START_POSITION', None)
BC_START_POSITION = config.get('BC_START_POSITION', None)
UMI_LENGTH = config.get('UMI_LENGTH', None) # 6
BC_LENGTH = config.get('BC_LENGTH', None) # 6
......@@ -38,6 +41,8 @@ BOWTIE2 = config.get('BOWTIE2', None) # '/local/apps/bowtie2/2.3.1/bowtie2'
## Annotations ##
# '/ifs/data/yanailab/refs/danio_rerio/danRer10_87/gtf/Danio_rerio.GRCz10.87.gtf.gz'
GFF = config.get('GFF', None)
FEATURE_ID = config.get('FEATURE_ID', 'gene_name')
FEATURE_CONTENT = config.get('FEATURE_CONTENT', 'exon')
## Demultiplexing ##
FASTQ_QUAL_MIN_OF_BC = config.get('FASTQ_QUAL_MIN_OF_BC', None) # 10
......@@ -185,8 +190,12 @@ rule combo_demultiplexing:
itemr1,
itemr2,
"--bc-index {}".format(BC_INDEX_FPATH),
"--bc-seq-column {}".format(BC_SEQ_COLUMN),
"--bc-index-used {}".format(itembc),
"--min-bc-quality {}".format(FASTQ_QUAL_MIN_OF_BC),
"--umi-start-position {}".format(
UMI_START_POSITION),
"--bc-start-position {}".format(BC_START_POSITION),
"--umi-length {}".format(UMI_LENGTH),
"--bc-length {}".format(BC_LENGTH),
"--cut-length {}".format(CUT_LENGTH),
......@@ -230,8 +239,8 @@ rule cook_annotation:
flag = touch('_done_annotation'),
message: 'Cooking Annotation'
run:
_ = cook_anno_model(GFF, feature_atrr='gene_id',
feature_type='exon',
_ = cook_anno_model(GFF, feature_atrr=FEATURE_ID,
feature_type=FEATURE_CONTENT,
stranded=True,
dumpto=output.anno,
verbose=verbose)
......
......@@ -18,6 +18,9 @@ install_requires = [
'pyyaml>=3.12, <4',
'HTSeq>=0.9',
'pytest==3.2.2',
'pandas>=0.20.0',
'numpy>=1.12.0',
'tables>=3.4.2',
]
# do not require installation if built by ReadTheDocs
......@@ -60,6 +63,7 @@ class CleanCommand(Command):
else:
os.system('rm -rf ./dist ./build ./*.egg-info ')
setup(
name=name,
......@@ -155,6 +159,10 @@ setup(
'celseq2.dummy_CELSeq2_reads:main'),
('celseq2-test = '
'celseq2.dummy_celseq2_test:main'),
('celseq2-diagnose = '
'celseq2.diagnose:main'),
('celseq2-to-st = '
'celseq2.support.st_pipeline:main'),
],
},
......
......@@ -52,6 +52,8 @@ def instance_demultiplex_stats(tmpdir_factory, instance_celseq2_data):
read2_fpath=str(r2_gz),
outdir=fdir,
dict_bc_id2seq=dict_bc_id2seq,
start_umi=0,
start_bc=6,
len_umi=6,
len_bc=6,
len_tx=35,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment