Commit 911f02aa authored by Puriney

🍻 release demultiplexing

# project-specific
/.vscode/
*/data/
# sphinx docs
/docs/build/
# dropbox
/.dropbox
# vim
[._]*.s[a-w][a-z]
[._]s[a-w][a-z]
*.un~
Session.vim
.netrwhist
*~
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
# Translations
*.mo
*.pot
# Django stuff:
*.log
# Sphinx documentation
docs/_build/
# PyBuilder
target/
Yun Yan (<yy1533@nyu.edu>)
Florian Wagner (<florian.wagner@nyu.edu>)
# CEL-Seq2 bioinformatics pipeline
# Dependencies
# Install
# Quick Start
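A minimal sketch of running the demultiplexer from Python (the module path `celseq2.demultiplex` is an assumption based on this commit's layout; every file path below is a placeholder to adapt to your own data):

```python
from celseq2.demultiplex import bc_dict_seq2id, demultiplexing, write_demultiplexing

# Hypothetical inputs: a tab-separated barcode index plus a gzipped read pair.
bc_dict = bc_dict_seq2id('barcodes.tsv')
stats = demultiplexing('lane_R1.fastq.gz', 'lane_R2.fastq.gz', bc_dict,
                       outdir='demultiplexed',  # must already exist
                       len_umi=6, len_bc=6, len_tx=35, bc_qual_min=10,
                       is_gzip=True)
write_demultiplexing(stats, bc_dict, 'demultiplexing.log')
```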
# Documents
# coding: utf-8
from collections import Counter
import csv
import os
import argparse
from celseq2.helper import filehandle_fastq_gz, print_logger
def bc_dict_seq2id(bc_index_fpath):
""" dict[barcode_seq] = barcode_id """
out = dict()
with open(bc_index_fpath, 'rt') as fin:
freader = csv.reader(fin, delimiter='\t')
next(freader)
out = {row[1]: int(row[0]) for row in freader}
return(out)
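# Expected layout of the barcode index file read by bc_dict_seq2id() and
# bc_dict_id2seq(): tab-separated, first row is a header and is skipped, then
# one row per barcode with its integer ID followed by its sequence. The values
# here are a hypothetical illustration:
#
#   bc_id   bc_seq
#   1       AGACTC
#   2       AGCTAG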
def bc_dict_id2seq(bc_index_fpath):
    """ dict[barcode_id] = barcode_seq """
    out = dict()
    with open(bc_index_fpath, 'rt') as fin:
        freader = csv.reader(fin, delimiter='\t')
        next(freader)
        out = {int(row[0]): row[1] for row in freader}
    return(out)
def demultiplexing(read1_fpath, read2_fpath, dict_bc_seq2id,
outdir,
len_umi=6, len_bc=6, len_tx=35, bc_qual_min=10,
is_gzip=True,
do_bc_rev_complement=False,
do_tx_rev_complement=False,
verbose=False):
"""
Demultiplexing to fastq files based on barcode sequence.
"""
if is_gzip:
fh_umibc = filehandle_fastq_gz(read1_fpath)
fh_tx = filehandle_fastq_gz(read2_fpath)
else:
fh_umibc = open(read1_fpath, 'rt')
fh_tx = open(read2_fpath, 'rt')
sample_counter = Counter()
bc_fhout = dict()
for bc_seq, bc_id in dict_bc_seq2id.items():
bc_fhout[bc_seq] = os.path.join(outdir, 'BC-{}-{}.fastq'.format(bc_id,
bc_seq))
bc_fhout['UNKNOWNBC_R1'] = os.path.join(outdir, 'UNKNOWNBC_R1.fastq')
bc_fhout['UNKNOWNBC_R2'] = os.path.join(outdir, 'UNKNOWNBC_R2.fastq')
for bc_seq, v in bc_fhout.items():
bc_fhout[bc_seq] = open(v, 'w')
i = 0
while(True):
if verbose and i % 1000000 == 0:
print_logger('Processing {:,} reads...'.format(i))
try:
umibc_name = next(fh_umibc).rstrip()
umibc_seq = next(fh_umibc).rstrip()
next(fh_umibc)
umibc_qualstr = next(fh_umibc).rstrip()
tx_name = next(fh_tx).rstrip()
tx_seq = next(fh_tx).rstrip()
next(fh_tx)
tx_qualstr = next(fh_tx).rstrip()
i += 1
except StopIteration:
break
        # Sanity checks below are disabled for speed; input is assumed to be well-formed, paired, 4-line FASTQ.
# if not (umibc_name and umibc_seq and umibc_qualstr and tx_name and tx_seq and tx_qualstr):
# raise Exception('FastQError: Possible Broken Fastq. Check pair-{}.\n'.format(i+1))
# if len(umibc_seq) != len(umibc_qualstr) or len(tx_seq) != len(tx_qualstr):
# raise Exception('FastQError: Possible multi-line Fastq. Convert to 4-line please.\n')
# if umibc_name.split()[0] != tx_name.split()[0]:
# raise Exception('FastQError: Reads are not paired at pair-{}.\n'.format(i+1))
sample_counter['total'] += 1
if len(umibc_seq) < len_umi + len_bc:
continue
umibc_min_qual = min((ord(c)-33 for c in umibc_qualstr[:(len_umi + len_bc)]))
if umibc_min_qual < bc_qual_min:
continue
sample_counter['qualified'] += 1
umi, cell_bc = umibc_seq[0:len_umi], umibc_seq[len_umi:(len_umi + len_bc)]
try:
fhout = bc_fhout[cell_bc]
except KeyError:
fhout = bc_fhout['UNKNOWNBC_R1']
fhout.write('{}\n{}\n{}\n{}\n'.format(umibc_name, umibc_seq,
"+", umibc_qualstr))
fhout = bc_fhout['UNKNOWNBC_R2']
fhout.write('{}\n{}\n{}\n{}\n'.format(tx_name, tx_seq,
"+", tx_qualstr))
sample_counter['unknown'] += 1
continue
# if len(tx_seq) < len_tx:
# continue
if len(tx_seq) > len_tx:
tx_seq, tx_qualstr = tx_seq[:len_tx], tx_qualstr[:len_tx]
read_name = '@BC-{}_UMI-{}'.format(cell_bc, umi)
fhout.write('{}\n{}\n{}\n{}\n'.format(read_name, tx_seq, "+", tx_qualstr))
sample_counter[cell_bc] += 1
sample_counter['saved'] += 1
sample_counter['unqualified'] = sample_counter['total'] - sample_counter['qualified']
for _, v in bc_fhout.items():
v.close()
fh_umibc.close()
fh_tx.close()
return(sample_counter)
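# A hedged note on the outputs of demultiplexing(): each barcode in the index
# gets a 'BC-<id>-<seq>.fastq' file under `outdir`, reads whose cell barcode is
# not in the index go to UNKNOWNBC_R1.fastq/UNKNOWNBC_R2.fastq, and the returned
# Counter carries 'total', 'qualified', 'unqualified', 'unknown', 'saved', plus
# one count per cell barcode found in the index.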
def write_demultiplexing(stats, dict_bc_seq2id, stats_fpath):
    if stats_fpath is None:
        stats_fpath = 'demultiplexing.log'
    fh_stats = open(stats_fpath, 'w')
fh_stats.write('BC\tReads(#)\tReads(%)\n')
for k, v in dict_bc_seq2id.items():
fh_stats.write('{:03d}-{}\t{:,}\t{:06.2f}\n'.format(v, k, stats[k],
stats[k]/stats['total']*100))
fh_stats.write('{}\t{}\t{:06.2f}\n'.format('saved',
stats['saved'],
stats['saved']/stats['total']*100))
fh_stats.write('{}\t{}\t{:06.2f}\n'.format('unknown',
stats['unknown'],
stats['unknown']/stats['total']*100))
fh_stats.write('{}\t{}\t{:06.2f}\n'.format('qualified',
stats['qualified'],
stats['qualified']/stats['total']*100))
fh_stats.write('{}\t{}\t{:06.2f}\n'.format('unqualified',
stats['unqualified'],
stats['unqualified']/stats['total']*100))
    fh_stats.write('{}\t{}\t{:06.2f}\n'.format('total',
                                               stats['total'],
                                               stats['total']/stats['total']*100))
    fh_stats.close()
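# The stats file written above is a tab-separated table with columns BC,
# Reads(#) and Reads(%): one row per barcode (labelled '<zero-padded id>-<seq>')
# followed by summary rows for saved, unknown, qualified, unqualified and total.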
def main():
parser = argparse.ArgumentParser(description= __doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
# parser.add_argument('sample_sheet', type=str)
parser.add_argument('read1_fpath', type=str)
parser.add_argument('read2_fpath', type=str)
    parser.add_argument('--bc_index', type=str, metavar='FILENAME',
                        help='File path to the barcode index (tab-separated: barcode ID, then sequence)')
    parser.add_argument('--min-bc-quality', metavar='N', type=int, default=10,
                        help='Minimum base quality over the UMI+barcode region (default=10)')
parser.add_argument('--out-dir', metavar='DIRNAME', type=str, default='.',
help='Output directory. Defaults to current directory')
parser.add_argument('--is-gzip', dest='is_gzip', action='store_true')
parser.add_argument('--not-gzip', dest='is_gzip', action='store_false')
parser.set_defaults(is_gzip=True)
parser.add_argument('--stats-file', metavar='STATFILE',
type=str, default='demultiplexing.log',
help='Statistics (default: demultiplexing.log)')
    parser.add_argument('--umi-length', metavar='N', type=int, default=0,
                        help='Length of UMI (default=0, i.e. no UMI)')
parser.add_argument('--bc-length', metavar='N', type=int, default=8,
help='Length of CELSeq barcode (default=8)')
    parser.add_argument('--cut-length', metavar='N', type=int, default=35,
                        help='Length to which the transcript read is trimmed (default=35)')
parser.add_argument('--verbose', dest='verbose', action='store_true')
parser.set_defaults(verbose=False)
args = parser.parse_args()
bc_dict = bc_dict_seq2id(args.bc_index)
print_logger('Demultiplexing starts ...')
out = demultiplexing(read1_fpath=args.read1_fpath,
read2_fpath=args.read2_fpath,
outdir=args.out_dir, dict_bc_seq2id=bc_dict,
len_umi=args.umi_length,
len_bc=args.bc_length,
len_tx=args.cut_length,
bc_qual_min=args.min_bc_quality,
is_gzip=args.is_gzip,
do_bc_rev_complement=False,
do_tx_rev_complement=False,
verbose=args.verbose)
print_logger('Demultiplexing ends ...')
write_demultiplexing(out, bc_dict, args.stats_file)
if __name__ == "__main__":
main()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@contact: Yun Yan (yy1533@nyu.edu)
"""
import os
import io
import time
import shutil
import subprocess
import sys
# import yaml
def join_path(*args):
x = map(str, args)
return(os.path.join(*x))
def paste(*args, sept=' '):
x = map(str, args)
return(sept.join(x))
def paste0(*args):
return(paste(*args, sept=""))
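# e.g. paste('a', 'b', 1) returns 'a b 1'; paste0('a', 'b', 1) returns 'ab1'.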
def ymdhms():
return(time.strftime('%Y%m%d%H%M%S%Z', time.gmtime()))
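# e.g. ymdhms() returns a compact UTC timestamp such as '20170612153045GMT'
# (the %Z suffix depends on the platform).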
def print_logger(msg):
localtime = time.asctime(time.localtime(time.time()))
sys.stderr.write("[ {} ] {}\n".format(localtime, msg))
def mkfolder(dirpath):
try:
os.makedirs(dirpath)
except OSError:
pass
def resetfolder(dirpath):
if not os.path.isdir(dirpath):
raise OSError('Not a directory')
try:
shutil.rmtree(dirpath)
os.makedirs(dirpath)
except OSError:
raise OSError(paste("Fail to reset folder", dirpath))
return(None)
def is_nonempty_file(fpath, verbose=False):
if os.path.exists(fpath):
if os.path.isfile(fpath):
try:
return(os.path.getsize(fpath) != 0)
except OSError:
return(False)
        else:
            if verbose: print_logger(paste(fpath, "is not a regular file"))
            return(False)
    else:
        if verbose: print_logger(paste(fpath, "does not exist or is not accessible"))
        return(False)
def resetfpath(fpath):
if is_nonempty_file(fpath):
os.remove(fpath)
return(None)
def base_name(fpath, ext=None):
    bs = os.path.basename(fpath)
    if not (ext is None or ext == ""):
        bs = bs.replace(ext, '')
    else:
        bs = os.path.splitext(bs)[0]
    return(bs)
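# e.g. base_name('/tmp/sample.fastq') returns 'sample';
#      base_name('/tmp/sample.fastq.gz', ext='.fastq.gz') returns 'sample'.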
def popen_communicate(cmd):
try:
p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
except ValueError:
        print_logger(paste0("Failed to run: ", cmd, "\n"))
return(None)
(pout, perr) = p.communicate()
return(pout)
def filehandle_fastq_gz(fpath):
# pout = popen_communicate('zcat {}'.format(fpath))
# fh = io.BytesIO(pout)
p = subprocess.Popen('gunzip -c {}'.format(fpath), shell=True, stdout=subprocess.PIPE)
fh = io.TextIOWrapper(p.stdout, encoding='ascii')
return(fh)
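# A hedged usage note: filehandle_fastq_gz() shells out to `gunzip -c`, so
# gunzip must be on PATH; the returned handle yields decoded text lines, e.g.
#   fh = filehandle_fastq_gz('reads.fastq.gz')   # path is a placeholder
#   header = next(fh).rstrip()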
def main():
print_logger("This is function: helper.py")
if __name__ == '__main__':
main()
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from __future__ import print_function, division\n",
"\n",
"import csv\n",
"import os\n",
"import argparse\n",
"\n",
"from itertools import izip\n",
"from HTSeq import FastqReader\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%reload_ext line_profiler\n",
"import line_profiler"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def bc_dict_seq2id(bc_index_fpath):\n",
" \"\"\" dict[barcode_seq] = barcode_id \"\"\"\n",
" out = dict()\n",
" with open(bc_index_fpath, 'rb') as fin:\n",
" freader = csv.reader(fin, delimiter='\\t')\n",
" next(freader)\n",
" out = {row[1]: int(row[0]) for row in freader}\n",
" return(out)\n"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def demultiplexing(read1_fpath, read2_fpath, outdir, bc_dict,\n",
" len_umi=6, len_bc=6, len_tx=35, bc_qual_min=10,\n",
" do_bc_rev_complement=False,\n",
" do_tx_rev_complement=False,\n",
" verbose=False):\n",
" \"\"\"\n",
" Demultiplexing to fastq files based on\n",
" barcode sequence.\n",
" \"\"\"\n",
" fh_umibc = FastqReader(read1_fpath)\n",
" fh_tx = FastqReader(read2_fpath)\n",
"\n",
" sample_counter = Counter()\n",
"\n",
" bc_fhout = dict()\n",
" for bc_seq, bc_id in bc_dict.iteritems(): # py2\n",
" bc_fhout[bc_seq] = os.path.join(outdir, 'BC_{}-{}.fastq'.format(bc_id,\n",
" bc_seq))\n",
" bc_fhout['UNKNOWNBC_R1'] = os.path.join(outdir, 'UNKNOWNBC_R1.fastq')\n",
" bc_fhout['UNKNOWNBC_R2'] = os.path.join(outdir, 'UNKNOWNBC_R2.fastq')\n",
"\n",
" for bc_seq, v in bc_fhout.items():\n",
" bc_fhout[bc_seq] = open(v, 'wb')\n",
"\n",
" for i, (read_umibc, read_tx) in enumerate(izip(fh_umibc, fh_tx)):\n",
" sample_counter['total'] += 1\n",
" if len(read_umibc) < len_umi + len_bc:\n",
" continue\n",
"\n",
" if min(read_umibc.qual[:(len_umi + len_bc)]) < bc_qual_min:\n",
" continue\n",
"\n",
" sample_counter['qualified'] += 1\n",
" cell_bc = read_umibc.seq[len_umi:(len_umi + len_bc)]\n",
" umi = read_umibc.seq[0:len_umi]\n",
" try:\n",
" fhout = bc_fhout[cell_bc]\n",
" if len(read_tx) > len_tx:\n",
" read_tx = read_tx[:len_tx]\n",
"\n",
" read_name = read_tx.name.split()[0] + ':UMI:{}'.format(umi)\n",
" read_tx.name = read_name\n",
"\n",
" read_tx.write_to_fastq_file(fhout)\n",
" sample_counter['saved'] += 1\n",
" except KeyError as e:\n",
" fhout = bc_fhout['UNKNOWNBC_R1']\n",
" read_umibc.write_to_fastq_file(fhout)\n",
" fhout = bc_fhout['UNKNOWNBC_R2']\n",
" read_tx.write_to_fastq_file(fhout)\n",
" sample_counter['unknown'] += 1\n",
" sample_counter['unqualified'] = sample_counter['total'] - sample_counter['qualified']\n",
" for _, v in bc_fhout.items():\n",
" v.close()\n",
" if verbose:\n",
" print(sample_counter)\n",
" return(sample_counter)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Main Run"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"bc_index_fpath='/ifs/data/yanailab/refs/barcodes/barcodes_cel-seq_umis96.tab'\n",
"r1_fpath='/ifs/home/yy1533/Lab/cel-seq-pipe/demo/data/7_S1_L001_R1_001.fastq.1M.gz'\n",
"r2_fpath='/ifs/home/yy1533/Lab/cel-seq-pipe/demo/data/7_S1_L001_R2_001.fastq.1M.gz'\n",
"outdir='/ifs/home/yy1533/Lab/cel-seq-pipe/demo/bc_split_Y'"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"bc_dict = bc_dict_seq2id(bc_index_fpath)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%lprun -f demultiplexing demultiplexing(read1_fpath=r1_fpath, read2_fpath=r2_fpath,\\\n",
" outdir=outdir, bc_dict=bc_dict,\\\n",
" len_umi=6, len_bc=6, len_tx=35, bc_qual_min=10,\\\n",
" do_bc_rev_complement=False, do_tx_rev_complement=False,verbose=False)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},