Commit 356dc327 authored by yy1533's avatar yy1533

🐶 delete legacy code

parent f03e1ffa
## Inforamtion ##
PROJECT_NAME: 'wonderful_sc'
DIR_PROJ: '/absolute/path/to/saved/dir'
## Sample ##
R1: '/absolute/path/to/r1.fastq.gz'
R2: '/absolute/path/to/r2.fastq.gz'
## Cell Barcodes ID
bc_ids_used: '1,2,3,4-5'
## CEL-seq2 Tech Setting ##
BC_INDEX_FPATH: '/absolute/path/to/wonderful_umi.tab'
BC_IDs_DEFAULT: '1-96'
UMI_LENGTH: 6
BC_LENGTH: 6
## Tools ##
BOWTIE2_INDEX_PREFIX: '/absolute/path/to/bowtie2_index'
BOWTIE2: '/absolute/path/to/bowtie2'
## Annotations ##
GFF: '/absolute/path/to/wonderful.gtf.gz'
## Demultiplexing ##
FASTQ_QUAL_MIN_OF_BC: 10
CUT_LENGTH: 35
## Alignment ##
ALIGNER: 'bowtie2'
## UMI Count ##
ALN_QUAL_MIN: 0
## Running Parameters ##
num_threads: 5
verbose: True
######################################################################
## Dependencies ##
from celseq2.helper import mkfolder, join_path, base_name, print_logger
from celseq2.helper import is_nonempty_file, resetfolder, rmfolder
from celseq2.prepare_annotation_model import cook_anno_model
from celseq2.count_umi import count_umi
import pandas as pd
import glob
from pytools.persistent_dict import PersistentDict
## Inforamtion ##
PROJECT_NAME=config.get('PROJECT_NAME', None) # 'celseq2_demo'
DIR_PROJ=config.get('DIR_PROJ', None) # '/ifs/home/yy1533/Lab/cel-seq-pipe/demo/celseq2'
## Sample Sheet ##
# SAMPLE_SHEET=config.get('SAMPLE_SHEET', None) # ''
R1=config.get('R1', None) # '/ifs/home/yy1533/Lab/cel-seq-pipe/demo/data/7_S1_L001_R1_001.fastq.gz'
R2=config.get('R2', None) # '/ifs/home/yy1533/Lab/cel-seq-pipe/demo/data/7_S1_L001_R2_001.fastq.gz'
## CEL-seq2 Tech Setting ##
BC_INDEX_FPATH=config.get('BC_INDEX_FPATH', None) # '/ifs/data/yanailab/refs/barcodes/barcodes_cel-seq_umis96.tab'
BC_IDs_DEFAULT=config.get('BC_IDs_DEFAULT', None) # '1-96'
UMI_LENGTH=config.get('UMI_LENGTH', None) # 6
BC_LENGTH=config.get('BC_LENGTH', None) # 6
## Tools ##
BOWTIE2_INDEX_PREFIX=config.get('BOWTIE2_INDEX_PREFIX', None) # '/ifs/data/yanailab/refs/danio_rerio/danRer10_87/genome/Danio_rerio.GRCz10.dna.toplevel'
BOWTIE2=config.get('BOWTIE2', None) # '/local/apps/bowtie2/2.3.1/bowtie2'
## Annotations ##
GFF=config.get('GFF', None) # '/ifs/data/yanailab/refs/danio_rerio/danRer10_87/gtf/Danio_rerio.GRCz10.87.gtf.gz'
## Demultiplexing ##
FASTQ_QUAL_MIN_OF_BC=config.get('FASTQ_QUAL_MIN_OF_BC', None) # 10
CUT_LENGTH=config.get('CUT_LENGTH', None) # 35
## Alignment ##
ALIGNER=config.get('ALIGNER', None) # 'bowtie2'
## UMI Count ##
ALN_QUAL_MIN=config.get('ALN_QUAL_MIN', None) # 0
## Running Parameters ##
bc_ids_used=config.get('bc_ids_used', None) # '1,2,3,4-5'
num_threads=config.get('num_threads', None) # 5
verbose=config.get('verbose', None) # True
if not DIR_PROJ:
print_logger('Please specify configuration for pipeline\n')
exit(1)
if not R1 or not R2:
print_logger('Please specify reads file\n')
exit(1)
#######################
## Pipeline reserved ##
#######################
SUBDIR_FASTQ='smallfq'
SUBDIR_ALIGN='smallsam'
SUBDIR_EXPR ='expr'
SUBDIR_LOG='log'
SUBDIR_DIAG='smalldiagnose'
SUBDIR_ANNO='annotation'
SUBDIRS = [SUBDIR_FASTQ, SUBDIR_ALIGN, SUBDIR_EXPR,
SUBDIR_LOG, SUBDIR_DIAG, SUBDIR_ANNO]
aln_diagnose_item = ["_unmapped",
"_low_map_qual", '_multimapped', "_uniquemapped",
"_no_feature", "_ambiguous",
"_total"]
## Dev ##
storage = PersistentDict("mystorage")
#####################
## Snakemake rules ##
#####################
workdir: DIR_PROJ
rule all:
input:
# Expression Matrix
join_path(DIR_PROJ, SUBDIR_EXPR, PROJECT_NAME + '.csv'),
join_path(DIR_PROJ, SUBDIR_EXPR, PROJECT_NAME + '.pickle'),
# Diagnose of alignment
join_path(DIR_PROJ, SUBDIR_DIAG, PROJECT_NAME + '_alignment.csv')
run:
print_logger('Expression UMI matrix is saved at {}'.format(input[0]))
rule setup_dir:
output: SUBDIRS
run:
for i in output:
mkfolder(join_path(DIR_PROJ, i))
## Demultiplexing ##
rule demultiplexing:
input: R1, R2
output: dynamic(join_path(DIR_PROJ, SUBDIR_FASTQ, '{bc}.fastq'))
run:
outdir = join_path(DIR_PROJ, SUBDIR_FASTQ)
shell(
"""
bc_demultiplex \
{R1} \
{R2} \
--bc-index {BC_INDEX_FPATH} \
--bc-index-used {bc_ids_used} \
--min-bc-quality {FASTQ_QUAL_MIN_OF_BC} \
--umi-length {UMI_LENGTH} \
--bc-length {BC_LENGTH} \
--cut-length {CUT_LENGTH} \
--out-dir {outdir} \
--is-gzip \
""")
## Alignment ##
rule align_bowtie2:
input:
join_path(DIR_PROJ, SUBDIR_FASTQ, '{bc}.fastq')
output:
join_path(DIR_PROJ, SUBDIR_ALIGN, '{bc}.sam')
threads: num_threads
log:
join_path(DIR_PROJ, SUBDIR_LOG,
'Align-Bowtie2_Cell-{bc}.log')
shell:
"""
{BOWTIE2} \
-p {threads} \
-x {BOWTIE2_INDEX_PREFIX} \
-U {input} \
-S {output} 2>{log} \
--seed 42
"""
## HT-seq Count UMI ##
rule cook_annotation:
input: GFF
output:
anno=temp(join_path(DIR_PROJ, SUBDIR_ANNO,
base_name(GFF) + '.pickle'))
run:
v = cook_anno_model(GFF, feature_atrr='gene_id',
feature_type='exon',
stranded=True,
dumpto=output.anno,
verbose=verbose)
storage.store("anno", v)
rule umi_matrix:
input:
gff=join_path(DIR_PROJ, SUBDIR_ANNO,
base_name(GFF) + '.pickle'),
sam=dynamic(join_path(DIR_PROJ, SUBDIR_ALIGN, '{bc}.sam')),
output:
csv=join_path(DIR_PROJ, SUBDIR_EXPR,
PROJECT_NAME + '.csv'),
pickle=join_path(DIR_PROJ, SUBDIR_EXPR,
PROJECT_NAME + '.pickle'),
diagnose=join_path(DIR_PROJ, SUBDIR_DIAG,
PROJECT_NAME + '_alignment.csv'),
log:
join_path(DIR_PROJ, SUBDIR_LOG, 'umi_matrix.log')
run:
features_f = storage.fetch("anno")[0]
all_genes = sorted(list(storage.fetch("anno")[1]))
expr_dict = {}
aln_dict = {}
for s in input.sam:
cell = base_name(s)
print_logger('Counting cell {}'.format(cell))
umi_cnt, aln_cnt = count_umi(sam_fpath=s,
features=features_f,
len_umi=UMI_LENGTH,
accept_aln_qual_min=ALN_QUAL_MIN,
dumpto=None)
gnames = [x for x in umi_cnt]
expr_dict[cell] = pd.Series([umi_cnt[x] for x in gnames],
index=gnames)
aln_dict[cell] = pd.Series([aln_cnt[x] for x in aln_diagnose_item],
index=aln_diagnose_item)
expr_df = pd.DataFrame(expr_dict, index=all_genes).fillna(0)
expr_df.to_csv(output.csv)
expr_df.to_pickle(output.pickle)
diagnose_aln = pd.DataFrame(aln_dict, index=aln_diagnose_item).fillna(0)
diagnose_aln.to_csv(output.diagnose)
rule cleanall:
run:
for d in SUBDIRS:
rmfolder(join_path(DIR_PROJ, d))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment