diff --git a/workflow/conf/multiqc_config.yaml b/workflow/conf/multiqc_config.yaml index 87ce3ba5492d9cb6fb649413a1227c0bd5242883..1f3c6506a9db50a75c9f6fa91e23822e800ddd76 100644 --- a/workflow/conf/multiqc_config.yaml +++ b/workflow/conf/multiqc_config.yaml @@ -70,7 +70,7 @@ custom_data: meta: file_format: 'tsv' section_name: 'Metadata' - description: 'This is the comparison of infered metadata and submitter provided' + description: 'This is the comparison of infered metadata, submitter provided, and calculated' plot_type: 'table' pconfig: id: 'meta' @@ -80,6 +80,8 @@ custom_data: Ends Stranded Spike-in + Read Length + TIN tin: file_format: 'tsv' section_name: 'TIN' diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index e09a1d7f1d2a951dceed0050a4db79bde951cf48..4de1e7409828d791e5f323ee74715bc05237fc81 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -241,7 +241,7 @@ process parseMetadata { readLength=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p readLength) - if [ "\${readLength}" == "nan"] + if [ "\${readLength}" == "nan" ] then - readLength="Not Entered" + readLength="NA" fi echo -e "LOG: read length metadata parsed: \${readLength}" >> ${repRID}.parseMetadata.log @@ -317,7 +317,7 @@ process trimData { """ } -// Split metadata into separate channels +// Extract calculated read length metadata into channel readLengthInfer = Channel.create() inferMetadata_readLength.splitCsv(sep: ",", header: false).separate( readLengthInfer @@ -919,7 +919,8 @@ process dataQC { val ends from endsInfer_dataQC output: - path "${repRID}.tin.hist.tsv" into tin + path "${repRID}.tin.hist.tsv" into tinHist + path "${repRID}.tin.med.csv" into tinMed path "${repRID}.insertSize.inner_distance_freq.txt" into innerDistance script: @@ -952,6 +953,11 @@ process dataQC { """ } +// Extract median TIN into channel +tinMedInfer = Channel.create() +tinMed.splitCsv(sep: ",", header: false).separate( + tinMedInfer +) /* *aggrQC: aggregate QC from processes as well as metadata and run MultiQC 
*/ @@ -968,7 +974,7 @@ process aggrQC { path dedupQC path countsQC path innerDistance - path tin + path tinHist path alignSampleQCs from alignSampleQC_aggrQC.collect() path inferExperiment val endsManual from endsManual_aggrQC @@ -982,6 +988,7 @@ process aggrQC { val speciesI from speciesInfer_aggrQC val readLengthM from readLengthMeta val readLengthI from readLengthInfer + val tinMedI from tinMedInfer val expRID val studyRID @@ -1000,11 +1007,11 @@ process aggrQC { # make metadata table echo -e "LOG: creating metadata table" >> ${repRID}.aggrQC.log - echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in\tRead Length" > metadata.tsv - echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-" >> metadata.tsv - echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}\t${readLengthM}" >> metadata.tsv - echo -e "Manual\t-\t${endsManual}\t-\t-\t-" >> metadata.tsv - echo -e "Measured\t-\t-\t-\t-\t${readLengthI}" + echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in\tRead Length\tTIN" > metadata.tsv + echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-" >> metadata.tsv + echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}\t${readLengthM}\t-" >> metadata.tsv + echo -e "Manual\t-\t${endsManual}\t-\t-\t-\t-" >> metadata.tsv + echo -e "Measured\t-\t-\t-\t-\t${readLengthI}\t${tinMedI}" >> metadata.tsv # remove inner distance report if it is empty (SE repRID) echo -e "LOG: removing dummy inner distance file" >> ${repRID}.aggrQC.log diff --git a/workflow/scripts/tinHist.py b/workflow/scripts/tinHist.py index df32a985c165e2b266fb26fc347bd13eb1580999..6cff3b9abbcec4bdb91edacfc4c12842453f19e5 100644 --- a/workflow/scripts/tinHist.py +++ b/workflow/scripts/tinHist.py @@ -15,6 +15,7 @@ def get_args(): def main(): args = get_args() tin = pd.read_csv(args.repRID + '.sorted.deduped.tin.xls',sep="\t",header=0) + hist = pd.cut(tin['TIN'],bins=pd.interval_range(start=0,freq=10,end=100,closed='right')).value_counts(sort=False) labels = 
["{0} - {1}".format(i, i + 9) for i in range(1, 100, 10)] #labels[0] = '0 - 10' @@ -34,6 +35,7 @@ def main(): hist = hist.T.fillna(0.0).astype(int) #hist = hist.apply(lambda x: x/x.sum()*100, axis=1) hist.to_csv(args.repRID + '.tin.hist.tsv',sep='\t') + pd.Series([tin['TIN'][(tin['TIN']!=0)].median()]).to_csv(args.repRID + '.tin.med.csv',sep=',',index=False,header=False,na_rep='NA') if __name__ == '__main__': main() \ No newline at end of file