Commit f49458a0 authored by Venkat Malladi's avatar Venkat Malladi

Add in spike-in analysis.

parent 650b34e0
......@@ -26,6 +26,7 @@ PARP-1 interacting snoRNA and DDX21
- tables/: Tables of RIP-seq binding
## Analysis of RNA-seq
- spike_in_correlation.py: Script to look at Spike-in counts to correlation
## General Analysis
......
#!/bin/bash
#SBATCH --job-name=ddx21_counts
#SBATCH --partition=super
#SBATCH --nodes=1
#SBATCH --time=0-24:00:00
#SBATCH --output=ddx21_counts.%j.out
#SBATCH --error=ddx21_counts.%j.err
#SBATCH --mail-user=venkat.malladi@utsouthwestern.edu
#SBATCH --mail-type=ALL
## Load the prerequisite modules
module load HTSeq/0.6.1
ANOTATION='/project/GCRB/shared/Gencode_human/release_19_ERCC_92/merge-annotation.sh-1.1.0/gencode.v19.annotation_ERCC_92.gtf'
htseq-count \
-f bam \
-m intersection-nonempty \
-i gene_id \
-r pos \
-s no \
/project/GCRB/Lee_Lab/s163035/MCF7_DDX21_inducibile_kd_Daeseok/LucKD/spike/align-tophat-se.sh-1.0.0/LucKD-1_S1_R1_001.fastq.gz.accepted_hits.bam $ANOTATION > /project/GCRB/Lee_Lab/s163035/MCF7_DDX21_inducibile_kd_Daeseok/htseq_counts/LucKD-1.counts
htseq-count \
-f bam \
-m intersection-nonempty \
-i gene_id \
-r pos \
-s no \
/project/GCRB/Lee_Lab/s163035/MCF7_DDX21_inducibile_kd_Daeseok/LucKD/spike/align-tophat-se.sh-1.0.0/LucKD-2_S5_R1_001.fastq.gz.accepted_hits.bam $ANOTATION > /project/GCRB/Lee_Lab/s163035/MCF7_DDX21_inducibile_kd_Daeseok/htseq_counts/LucKD-2.counts
htseq-count \
-f bam \
-m intersection-nonempty \
-i gene_id \
-r pos \
-s no \
/project/GCRB/Lee_Lab/s163035/MCF7_DDX21_inducibile_kd_Daeseok/DDX21_1KD_pos_Mut_DDX21/spike/align-tophat-se.sh-1.0.0/DDX21_1KD_pos_Mut_DDX21-1_S4_R1_001.fastq.gz.accepted_hits.bam $ANOTATION > /project/GCRB/Lee_Lab/s163035/MCF7_DDX21_inducibile_kd_Daeseok/htseq_counts/DDX21_1KD_pos_Mut_DDX21-1.counts
htseq-count \
-f bam \
-m intersection-nonempty \
-i gene_id \
-r pos \
-s no \
/project/GCRB/Lee_Lab/s163035/MCF7_DDX21_inducibile_kd_Daeseok/DDX21_1KD_pos_Mut_DDX21/spike/align-tophat-se.sh-1.0.0/DDX21_1KD_pos_Mut_DDX21_S8_R1_001.fastq.gz.accepted_hits.bam $ANOTATION > /project/GCRB/Lee_Lab/s163035/MCF7_DDX21_inducibile_kd_Daeseok/htseq_counts/DDX21_1KD_pos_Mut_DDX21-2.counts
htseq-count \
-f bam \
-m intersection-nonempty \
-i gene_id \
-r pos \
-s no \
/project/GCRB/Lee_Lab/s163035/MCF7_DDX21_inducibile_kd_Daeseok/DDX21_1KD/spike/align-tophat-se.sh-1.0.0/DDX21_1KD-2_S6_R1_001.fastq.gz.accepted_hits.bam $ANOTATION > /project/GCRB/Lee_Lab/s163035/MCF7_DDX21_inducibile_kd_Daeseok/htseq_counts/DDX21_1KD-2.counts
htseq-count \
-f bam \
-m intersection-nonempty \
-i gene_id \
-r pos \
-s no \
/project/GCRB/Lee_Lab/s163035/MCF7_DDX21_inducibile_kd_Daeseok/DDX21_1KD/spike/align-tophat-se.sh-1.0.0/DDX21_1KD-1_S2_R1_001.fastq.gz.accepted_hits.bam $ANOTATION > /project/GCRB/Lee_Lab/s163035/MCF7_DDX21_inducibile_kd_Daeseok/htseq_counts/DDX21_1KD-1.counts
htseq-count \
-f bam \
-m intersection-nonempty \
-i gene_id \
-r pos \
-s no \
/project/GCRB/Lee_Lab/s163035/MCF7_DDX21_inducibile_kd_Daeseok/DDX21_1KD_pos_WT_DDX21/spike/align-tophat-se.sh-1.0.0/DDX21_1KD_pos_WT_DDX21-2_S7_R1_001.fastq.gz.accepted_hits.bam $ANOTATION > /project/GCRB/Lee_Lab/s163035/MCF7_DDX21_inducibile_kd_Daeseok/htseq_counts/DDX21_1KD_pos_WT_DDX21-2.counts
htseq-count \
-f bam \
-m intersection-nonempty \
-i gene_id \
-r pos \
-s no \
/project/GCRB/Lee_Lab/s163035/MCF7_DDX21_inducibile_kd_Daeseok/DDX21_1KD_pos_WT_DDX21/spike/align-tophat-se.sh-1.0.0/DDX21_1KD_pos_WT_DDX21-1_S3_R1_001.fastq.gz.accepted_hits.bam $ANOTATION > /project/GCRB/Lee_Lab/s163035/MCF7_DDX21_inducibile_kd_Daeseok/htseq_counts/DDX21_1KD_pos_WT_DDX21-1.counts
#!/usr/bin/env python
# -*- coding: latin-1 -*-
'''Take an Counts file and plot Spikein'''
EPILOG = '''
For more details:
%(prog)s --help
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
import argparse
def get_args():
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument('-c', '--counts',
help="The file with counts values.",
required = True)
parser.add_argument('-f','--factor',
help="Factor that is being analyzed.",
required = True)
args = parser.parse_args()
return args
def main():
args = get_args()
spike_in = pd.read_csv('ERCC_spikein/spike_in_concentrations.tsv', sep='\t')
counts = pd.read_csv(args.counts, sep='\t')
spike_in_counts = pd.merge(counts,spike_in ,on='ERCC ID')
spike_in_counts.columns = ['ERCC ID', 'Counts', 'Re-sort ID','subgroup','Mix1', 'Mix2', 'fold-change','mix1/mix2']
test = pd.DataFrame()
test['Counts'] = np.log2(spike_in_counts['Counts']+1)
test['Mix1'] = np.log2(spike_in_counts['Mix1'])
x = test['Mix1'].values
y = test['Counts'].values
x = x.reshape(92, 1)
y = y.reshape(92, 1)
regr = linear_model.LinearRegression()
regr.fit(x,y)
cor = regr.coef_
test.plot.scatter(x='Mix1', y='Counts')
plt.plot(x, regr.predict(x), color='blue',linewidth=3,label="Correlation r = %.4f"%(cor))
plt.legend(loc=2)
plt.xlabel('log2(Mix1)')
plt.ylabel('log2(Counts+1)')
plt.savefig('figures/' + args.factor + '_counts.png')
plt.clf()
if __name__ == '__main__':
main()
# Spikein data cleanup
for i in $(ls /Volumes/project/GCRB/Lee_Lab/s163035/MCF7_DDX21_inducibile_kd_Daeseok/htseq_counts/*counts);
do raw_fn=$(basename "${i}");
raw_prefix=${raw_fn%.counts};
echo -e "ERCC ID\tCounts" > test.counts;
grep "gSpikein_ERCC" $i >> test.counts;
sed -i '' 's/gSpikein_ERCC/ERCC/' test.counts;
./spike_in_correlation.py -c test.counts -f $raw_fn;
done
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment