# gct_serum_mirna_analysis.py

"""
Author: John Lafin
Email: lafinj@gmail.com

This script is built to accept a CSV file
with three columns with headers (in order): Target, Sample, and Cq.
It expects that the extraction efficiency target is named 'cel-miR-39'
and that the loading control target is named 'miR-30b'.
It will output one file containing the final dataframe with mean
raw Cq, normalized Cq, dCq, ddCq, and Rq values.
"""

import pandas as pd
import sys

# default target names and the Cq assigned to undetected (NaN) wells
EXTRACTION_TARGET = 'cel-miR-39'
LOADING_TARGET = 'miR-30b'
MISSING_CQ = 40


def derive_out_path(in_path):
    """Return the output path for *in_path*: '<name>_out.csv'.

    Only a trailing '.csv' extension is removed. (str.strip('.csv') would
    instead strip any of the characters '.', 'c', 's', 'v' from BOTH ends,
    mangling names such as 'scans.csv' or './data.csv'.)
    """
    suffix = '.csv'
    stem = in_path[:-len(suffix)] if in_path.endswith(suffix) else in_path
    return stem + '_out.csv'


def compute_relative_quantities(data, control,
                                extraction_target=EXTRACTION_TARGET,
                                loading_target=LOADING_TARGET,
                                missing_cq=MISSING_CQ):
    """Compute mean Cq, normalized Cq, dCq, ddCq and Rq for a miRNA panel.

    Parameters:
        data: DataFrame indexed by a ('Target', 'Sample') MultiIndex with a
            'Cq' column holding the replicate measurements.
        control: name of the sample every other sample is expressed
            relative to (its ddCq is 0 and its Rq is 1 for each target).
        extraction_target: target used as the extraction-efficiency spike-in.
        loading_target: target used as the loading control.
        missing_cq: value substituted for missing (NaN) Cq measurements.

    Returns:
        A new DataFrame, sorted by index, with columns 'Cq' (mean raw Cq),
        'Cq_norm' (extraction-corrected loading-control Cq of the sample),
        'dCq', 'ddCq' and 'Rq'. The input frame is not modified.

    Raises:
        KeyError: if a required target or the control sample is absent, or
            if some target has no measurement for the control sample.
    """
    # convert NANs to the detection-limit Cq; fillna returns a new frame,
    # so the caller's data is left untouched by the index change below
    data = data.fillna(missing_cq)

    # ensure that levels are read in as string instead of int
    data.index = data.index.set_levels(
        data.index.levels[0].astype(str), level=0)
    data.index = data.index.set_levels(
        data.index.levels[1].astype(str), level=1)

    # mean of the replicates of each (target, sample) pair
    means = data.groupby(level=['Target', 'Sample'], sort=False).mean()

    present_targets = means.index.get_level_values(0)
    for required in (extraction_target, loading_target):
        if required not in present_targets:
            raise KeyError(
                'required target %r not found in the data' % (required,))
    if control not in means.index.get_level_values(1):
        raise KeyError('control sample %r not found in the data' % (control,))

    # Step 1: extraction correction = each sample's spike-in Cq minus the
    # lowest spike-in Cq observed across all samples
    spike_in = means.loc[extraction_target]
    correction = spike_in - spike_in.min()

    # Step 2: subtract the extraction correction from the loading-control
    # Cqs and attach the result to every row of the matching sample
    hk_norm = means.loc[loading_target] - correction
    means = means.join(hk_norm, rsuffix='_norm')

    # Step 3: normalize every target to the corrected loading control
    means['dCq'] = means['Cq'] - means['Cq_norm']

    # Step 4: ddCq relative to the control sample of the same target, and Rq
    control_dcq = means.xs(control, level=1)['dCq']
    row_targets = means.index.get_level_values(0)
    lacking = row_targets.difference(control_dcq.index)
    if len(lacking):
        raise KeyError(
            'no %r sample for target(s): %s'
            % (control, ', '.join(map(str, lacking))))
    # row_targets comes from means.index, so the reindexed values line up
    # positionally with the rows of means
    means['ddCq'] = means['dCq'] - control_dcq.reindex(row_targets).to_numpy()
    means['Rq'] = 2 ** (-means['ddCq'])

    return means.sort_index()


def main():
    """Read the CSV named on the command line and export the analysis."""
    if len(sys.argv) < 2:
        sys.exit('usage: gct_serum_mirna_analysis.py <input.csv>')

    # read in miRNA panel data
    # first two columns construct multiindex
    in_path = sys.argv[1]
    out_path = derive_out_path(in_path)
    relative = input('What is the name of your control?').strip()
    data = pd.read_csv(in_path, index_col=['Target', 'Sample'])

    # export to file
    compute_relative_quantities(data, relative).to_csv(out_path)


if __name__ == '__main__':
    main()