Unverified commit b62cd1d2, authored by Yun YAN, committed by GitHub

Merge pull request #12 from Puriney/master

🇨🇳 Upgrade to 0.4.6
parents 6545a9a4 66194ddc
from collections import OrderedDict
import re
def parse_bowtie2_report(raw_data):
    """
    Parse the log file of Bowtie2 (single-end reads ONLY).

    Adapted from MultiQC
    (https://github.com/ewels/MultiQC/blob/b885947a2112be94774a98bc6469026f23f1e36e/multiqc/modules/bowtie2/bowtie2.py)

    Example:
        18 reads; of these:
          18 (100.00%) were unpaired; of these:
            0 (0.00%) aligned 0 times
            14 (77.78%) aligned exactly 1 time
            4 (22.22%) aligned >1 times
        100.00% overall alignment rate
    """
    # Regexes for the single-end ('unpaired') and paired-end ('paired')
    # sections of a Bowtie2 log; only the 'unpaired' set is used below.
    regexes = {
        'unpaired': {
            'total_reads': r"(\d+) reads; of these:",
            'total_unpaired': r"(\d+) \([\d\.]+%\) were unpaired; of these:",
            'unmapped': r"(\d+) \([\d\.]+%\) aligned 0 times",
            'uniquely_mapped': r"(\d+) \([\d\.]+%\) aligned exactly 1 time",
            'multi_mapped': r"(\d+) \([\d\.]+%\) aligned >1 times",
            'overall_alignment_percent': r"([\d\.]+)% overall alignment rate",
        },
        'paired': {
            'paired_aligned_none': r"(\d+) \([\d\.]+%\) aligned concordantly 0 times",
            'paired_aligned_one': r"(\d+) \([\d\.]+%\) aligned concordantly exactly 1 time",
            'paired_aligned_multi': r"(\d+) \([\d\.]+%\) aligned concordantly >1 times",
            'paired_aligned_discord_one': r"(\d+) \([\d\.]+%\) aligned discordantly 1 time",
            'paired_aligned_discord_multi': r"(\d+) \([\d\.]+%\) aligned discordantly >1 times",
            'paired_aligned_mate_one': r"(\d+) \([\d\.]+%\) aligned exactly 1 time",
            'paired_aligned_mate_multi': r"(\d+) \([\d\.]+%\) aligned >1 times",
            'paired_aligned_mate_none': r"(\d+) \([\d\.]+%\) aligned 0 times"
        }
    }
    # Build from (key, value) pairs so insertion order is kept even on
    # Python < 3.7, where a dict comprehension would not preserve it.
    parsed_data = OrderedDict((k, 0) for k in regexes['unpaired'])
    for k, r in regexes['unpaired'].items():
        r_search = re.search(r, raw_data, re.MULTILINE)
        if r_search:
            parsed_data[k] = float(r_search.group(1))
    assert parsed_data['total_reads'] == parsed_data['total_unpaired']  # single-end
    return parsed_data
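
# Usage sketch (illustrative only, not shipped with the module): feed the
# single-end example from the docstring above to parse_bowtie2_report.
#
#     example_log = (
#         "18 reads; of these:\n"
#         "  18 (100.00%) were unpaired; of these:\n"
#         "    0 (0.00%) aligned 0 times\n"
#         "    14 (77.78%) aligned exactly 1 time\n"
#         "    4 (22.22%) aligned >1 times\n"
#         "100.00% overall alignment rate\n"
#     )
#     report = parse_bowtie2_report(example_log)
#     assert report['uniquely_mapped'] == 14.0  # counts are stored as floats
#     assert report['overall_alignment_percent'] == 100.0
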
def parse_star_report(raw_data):
    """
    Parse the STAR log file ('Log.final.out').

    Copied from MultiQC
    (https://github.com/ewels/MultiQC/blob/b885947a2112be94774a98bc6469026f23f1e36e/multiqc/modules/star/star.py)
    """
    regexes = {
        'total_reads': r"Number of input reads \|\s+(\d+)",
        'avg_input_read_length': r"Average input read length \|\s+([\d\.]+)",
        'uniquely_mapped': r"Uniquely mapped reads number \|\s+(\d+)",
        'uniquely_mapped_percent': r"Uniquely mapped reads % \|\s+([\d\.]+)",
        'avg_mapped_read_length': r"Average mapped length \|\s+([\d\.]+)",
        'num_splices': r"Number of splices: Total \|\s+(\d+)",
        'num_annotated_splices': r"Number of splices: Annotated \(sjdb\) \|\s+(\d+)",
        'num_GTAG_splices': r"Number of splices: GT/AG \|\s+(\d+)",
        'num_GCAG_splices': r"Number of splices: GC/AG \|\s+(\d+)",
        'num_ATAC_splices': r"Number of splices: AT/AC \|\s+(\d+)",
        'num_noncanonical_splices': r"Number of splices: Non-canonical \|\s+(\d+)",
        'mismatch_rate': r"Mismatch rate per base, % \|\s+([\d\.]+)",
        'deletion_rate': r"Deletion rate per base \|\s+([\d\.]+)",
        'deletion_length': r"Deletion average length \|\s+([\d\.]+)",
        'insertion_rate': r"Insertion rate per base \|\s+([\d\.]+)",
        'insertion_length': r"Insertion average length \|\s+([\d\.]+)",
        'multimapped': r"Number of reads mapped to multiple loci \|\s+(\d+)",
        'multimapped_percent': r"% of reads mapped to multiple loci \|\s+([\d\.]+)",
        'multimapped_toomany': r"Number of reads mapped to too many loci \|\s+(\d+)",
        'multimapped_toomany_percent': r"% of reads mapped to too many loci \|\s+([\d\.]+)",
        'unmapped_mismatches_percent': r"% of reads unmapped: too many mismatches \|\s+([\d\.]+)",
        'unmapped_tooshort_percent': r"% of reads unmapped: too short \|\s+([\d\.]+)",
        'unmapped_other_percent': r"% of reads unmapped: other \|\s+([\d\.]+)",
    }
    # Pre-seed every key with 0 so missing fields stay defined (and ordered
    # on Python < 3.7).
    parsed_data = OrderedDict((k, 0) for k in regexes)
    for k, r in regexes.items():
        r_search = re.search(r, raw_data, re.MULTILINE)
        if r_search:
            parsed_data[k] = float(r_search.group(1))
    # Figure out the numbers of unmapped reads: STAR reports only the
    # percentages for these categories.
    try:
        total_mapped = parsed_data['uniquely_mapped'] + \
            parsed_data['multimapped'] + parsed_data['multimapped_toomany']
        unmapped_count = parsed_data['total_reads'] - total_mapped
        total_unmapped_percent = parsed_data['unmapped_mismatches_percent'] + \
            parsed_data['unmapped_tooshort_percent'] + \
            parsed_data['unmapped_other_percent']
        try:
            parsed_data['unmapped_mismatches'] = int(round(
                unmapped_count * (parsed_data['unmapped_mismatches_percent'] /
                                  total_unmapped_percent), 0))
            parsed_data['unmapped_tooshort'] = int(round(
                unmapped_count * (parsed_data['unmapped_tooshort_percent'] /
                                  total_unmapped_percent), 0))
            parsed_data['unmapped_other'] = int(round(
                unmapped_count * (parsed_data['unmapped_other_percent'] /
                                  total_unmapped_percent), 0))
        except ZeroDivisionError:
            parsed_data['unmapped_mismatches'] = 0
            parsed_data['unmapped_tooshort'] = 0
            parsed_data['unmapped_other'] = 0
    except KeyError:
        pass
    # Kept from MultiQC; parsed_data is pre-seeded above, so this guard
    # never triggers here.
    if len(parsed_data) == 0:
        return None
    return parsed_data
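
# Usage sketch (illustrative only, not shipped with the module): the
# two-line excerpt below is an assumed minimal 'Log.final.out' fragment
# matching the regexes above; real STAR logs carry all of the fields.
#
#     excerpt = ("Number of input reads |\t100\n"
#                "Uniquely mapped reads number |\t90\n")
#     report = parse_star_report(excerpt)
#     assert report['total_reads'] == 100.0
#     assert report['uniquely_mapped'] == 90.0
#     # Unmatched fields stay 0; with all unmapped percentages at 0, the
#     # back-calculated unmapped counts fall back to 0 as well.
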
def merge_reports(reports, report_names=None,
                  aligner_name=None,
                  savetocsv='report.csv'):
    """ Merge a list of reports and save them as a CSV file. """
    if not reports:
        return
    n = len(reports)
    if not report_names:
        report_names = [str(i + 1) for i in range(n)]
    assert len(reports) == len(report_names)
    if (not aligner_name) or (str(aligner_name) not in ('bowtie2', 'star')):
        aligner_name = 'X'
    features = list(reports[0].keys())
    with open(savetocsv, 'w') as fout:
        # Header row: aligner name followed by the feature names.
        fout.write('{}\n'.format(','.join([str(aligner_name)] + features)))
        # One row per report, labeled with its report name.
        for i in range(n):
            i_name = report_names[i]
            i_values = list(reports[i].values())
            fout.write('{}\n'.format(
                ','.join(map(str, [i_name] + i_values))))
    return savetocsv
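
# Usage sketch (illustrative only, not shipped with the module): merge two
# parsed reports into one CSV, one row per cell barcode.
#
#     r1 = OrderedDict([('total_reads', 18.0), ('uniquely_mapped', 14.0)])
#     r2 = OrderedDict([('total_reads', 20.0), ('uniquely_mapped', 11.0)])
#     merge_reports([r1, r2], report_names=['BC-1', 'BC-2'],
#                   aligner_name='bowtie2', savetocsv='report.csv')
#     # report.csv then contains:
#     #   bowtie2,total_reads,uniquely_mapped
#     #   BC-1,18.0,14.0
#     #   BC-2,20.0,11.0
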
if __name__ == '__main__':
    print('This is parse_log.py')
-__version__ = '0.4.4'
+__version__ = '0.4.6'
@@ -5,6 +5,7 @@ from celseq2.helper import rmfolder, mkfolder
from celseq2.helper import cook_sample_sheet, popen_communicate
from celseq2.prepare_annotation_model import cook_anno_model
from celseq2.count_umi import count_umi, _flatten_umi_set
from celseq2.parse_log import parse_bowtie2_report, parse_star_report, merge_reports
import pandas as pd
import glob
import pickle
@@ -12,6 +13,8 @@ import pickle
from collections import Counter, defaultdict
from multiprocessing import Pool, cpu_count
import re
import tempfile
import shutil
import os
@@ -91,9 +94,12 @@ SUBDIR_LOG = 'small_log'
SUBDIR_QSUB = 'qsub_log'
SUBDIR_DIAG = 'small_diagnose'
SUBDIR_ANNO = 'annotation'
SUBDIR_REPORT = 'report'
SUBDIRS = [SUBDIR_INPUT,
           SUBDIR_FASTQ, SUBDIR_ALIGN, SUBDIR_UMI_CNT, SUBDIR_UMI_SET,
           SUBDIR_EXPR,
           SUBDIR_REPORT,
           SUBDIR_LOG, SUBDIR_QSUB, SUBDIR_DIAG, SUBDIR_ANNO
           ]
@@ -109,15 +115,22 @@ aln_diagnose_item = ["_unmapped",
workdir: DIR_PROJ
rule Count_Matrix:
rule all:
    input:
        csv = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.csv'),
                     expid=list(set(sample_list))),
        hdf = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.h5'),
                     expid=list(set(sample_list))),
        # alignment = expand(join_path(DIR_PROJ, SUBDIR_DIAG,
        #                              '{itemid}', 'alignment_diagnose.csv'),
        #                    itemid=item_names),
        report = expand(join_path(DIR_PROJ, SUBDIR_REPORT, '{itemid}',
                                  'alignment_' + ALIGNER + '.csv'),
                        itemid=item_names),

rule COUNT_MATRIX:
    input:
        csv = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.csv'),
                     expid=list(set(sample_list))),
        hdf = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.h5'),
                     expid=list(set(sample_list))),
    output:
        touch('_done_UMI')
    message: 'Finished counting UMI-count matrix.'
@@ -125,11 +138,13 @@ rule Count_Matrix:
        if ALIGNER == 'star':
            shell('rm {}'.format('_done_star_genome_loaded'))
            print('Free memory loaded by STAR', flush=True)
            cmd = 'STAR '
            cmd += '--genomeLoad Remove '
            cmd += '--genomeDir {STAR_INDEX_DIR} '
            shell(cmd)
            # Run the unload inside a temporary directory so STAR's run logs
            # (Log.out, Log.progress.out) land there instead of the project
            # root.
            with tempfile.TemporaryDirectory() as tmpdirname:
                cmd = 'STAR '
                cmd += '--genomeLoad Remove '
                cmd += '--genomeDir {STAR_INDEX_DIR} '
                cmd += '--outFileNamePrefix '
                cmd += join_path(tmpdirname, '')
                shell(cmd)
        print_logger('UMI-count matrix is saved at {}'.format(input.csv))
@@ -293,22 +308,36 @@ if ALIGNER == 'bowtie2':
            --seed 42
            """)

    rule parse_bowtie2_log:
        input:
            log = join_path(DIR_PROJ, SUBDIR_LOG, '{itemid}',
                            'Align-Bowtie2_Cell-{bc}.log'),
        output:
            report = join_path(DIR_PROJ, SUBDIR_LOG, '{itemid}', ALIGNER,
                               '{bc}.pickle'),
        run:
            with open(input.log, 'r') as fin:
                log_content = fin.readlines()
            df = parse_bowtie2_report('\t'.join(log_content))
            with open(output.report, 'wb') as fout:
                pickle.dump(df, fout)
if ALIGNER == 'star':
    assert STAR_INDEX_DIR

    rule star_load_genome:
        input:
            flag = '_done_annotation',
        output:
            touch('_done_star_genome_loaded')
            flag = '_done_star_genome_loaded',
        message: 'Loading genome to memory for STAR'
        shadow: "shallow"
        run:
            cmd = 'STAR '
            cmd += '--genomeLoad LoadAndExit '
            cmd += '--genomeDir {STAR_INDEX_DIR} '
            cmd += '&& echo loaded >> {output.flag} '
            shell(cmd)
            # shell('touch {output.flag} ')
            # shell('echo loaded >> {output.flag} ')
if ALIGNER == 'star':
    assert STAR
@@ -320,11 +349,13 @@ if ALIGNER == 'star':
            sam = join_path(DIR_PROJ, SUBDIR_ALIGN, '{itemid}', '{bc}.sam'),
            starsam = join_path(DIR_PROJ, SUBDIR_ALIGN, '{itemid}', '{bc}',
                                'Aligned.out.sam'),
            starlog = join_path(DIR_PROJ, SUBDIR_ALIGN, '{itemid}', '{bc}',
                                'Log.final.out'),
        params:
            threads = num_threads
            star_prefix = join_path(DIR_PROJ, SUBDIR_ALIGN,
                                    '{itemid}', '{bc}', ''),
            threads = num_threads,
        run:
            star_prefix = join_path(
                DIR_PROJ, SUBDIR_ALIGN, wildcards.itemid, wildcards.bc, '')
            cmd = 'STAR '
            cmd += ' --runRNGseed 42 '
            cmd += ' --genomeLoad LoadAndKeep '
@@ -332,12 +363,60 @@ if ALIGNER == 'star':
            cmd += ' --genomeDir {STAR_INDEX_DIR} '
            # cmd += ' --readFilesCommand zcat '
            cmd += ' --readFilesIn {input.fq} '
            cmd += ' --outFileNamePrefix {star_prefix} '
            cmd += ' --outFileNamePrefix {params.star_prefix} '
            # cmd += ' --outSAMmultNmax 1 '
            shell(cmd)
            shell('ln -s {output.starsam} {output.sam} ')
            shell('touch -h {output.sam} ')

    rule parse_star_log:
        input:
            starlog = join_path(DIR_PROJ, SUBDIR_ALIGN, '{itemid}', '{bc}',
                                'Log.final.out'),
        output:
            report = join_path(DIR_PROJ, SUBDIR_LOG, '{itemid}', ALIGNER,
                               '{bc}.pickle'),
        run:
            with open(input.starlog, 'r') as fin:
                log_content = fin.readlines()
            df = parse_star_report('\t'.join(log_content))
            with open(output.report, 'wb') as fout:
                pickle.dump(df, fout)
rule REPORT_ALIGNMENT_LOG:
    input:
        report = expand(join_path(DIR_PROJ, SUBDIR_REPORT, '{itemid}',
                                  'alignment_' + ALIGNER + '.csv'),
                        itemid=item_names),

rule report_alignment_log:
    input:
        '_done_umimatrix_per_experiment',
        df = dynamic(join_path(DIR_PROJ, SUBDIR_LOG, '{itemid}', ALIGNER,
                               '{bc}.pickle')),
    output:
        report = expand(join_path(DIR_PROJ, SUBDIR_REPORT, '{itemid}',
                                  'alignment_' + ALIGNER + '.csv'),
                        itemid=item_names),
    run:
        # Collect the per-barcode pickled reports of each item and merge
        # them into one CSV per item.
        for item in item_names:
            logs_per_item = []
            logs_name_item = []
            report_item = join_path(DIR_PROJ, SUBDIR_REPORT, item,
                                    'alignment_' + ALIGNER + '.csv')
            logs_fpath_item = [x for x in input.df if item in x]
            for log_fpath in logs_fpath_item:
                with open(log_fpath, 'rb') as fin:
                    log_df = pickle.load(fin)
                logs_per_item.append(log_df)
                log_name = base_name(log_fpath)
                logs_name_item.append(log_name)
            _ = merge_reports(reports=logs_per_item,
                              report_names=logs_name_item,
                              aligner_name=ALIGNER,
                              savetocsv=report_item)
rule count_umi:
    input:
@@ -420,6 +499,7 @@ rule summarize_umi_matrix_per_experiment:
                     expid=list(set(sample_list))),
        hdf = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.h5'),
                     expid=list(set(sample_list))),
        flag = '_done_umimatrix_per_experiment',
    run:
        _, all_genes = pickle.load(open(input.gff, 'rb'))
        all_genes = sorted(all_genes)
@@ -458,7 +538,7 @@ rule summarize_umi_matrix_per_experiment:
            expr_df.to_hdf(join_path(DIR_PROJ, SUBDIR_EXPR,
                                     exp_id, 'expr.h5'), 'table')

        shell('touch _done_umimatrix_per_experiment')
        shell('touch {output.flag} ')
rule summarize_alignment_diagnose:
......