From 92f9b5007185fc27e7906cc6ec6032904a064031 Mon Sep 17 00:00:00 2001
From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu>
Date: Mon, 28 Oct 2019 20:49:46 -0500
Subject: [PATCH] Add file rename in fetch.txt to modifyFetch.py

---
 workflow/rna-seq.nf             | 15 +--------------
 workflow/scripts/modifyFetch.py |  8 +++++++-
 workflow/scripts/renameFastq.sh | 12 ------------
 3 files changed, 8 insertions(+), 27 deletions(-)
 delete mode 100644 workflow/scripts/renameFastq.sh

diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index 32d4077..fe52377 100755
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -9,9 +9,6 @@ params.outDir = "${baseDir}/../output"
 // Parse input variables
 deriva = file(params.deriva)
 deriva.copyTo('~/.bdbag/deriva-cookies.txt')
-//deriva = Channel
-//  .fromPath(params.deriva)
-//  .ifEmpty { exit 1, "deriva cookie file not found: ${params.deriva}" }
 bdbag = Channel
   .fromPath(params.bdbag)
   .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" }
@@ -19,7 +16,7 @@ bdbag = Channel
 outDir = params.outDir
 
 /*
- * splitData: split bdbag files by replicate so fetch can occure in parallel
+ * splitData: split bdbag files by replicate so fetch can occur in parallel, and rename files to replicate RID
  */
 process splitData {
   tag "${bdbag.baseName}"
@@ -27,7 +24,6 @@ process splitData {
 
   input:
     file bdbag
-    file deriva
 
   output:
     file("Replicate_*.zip") into bdbagSplit mode flatten
@@ -39,15 +35,11 @@ process splitData {
     """
     hostname
     ulimit -a
-    ln -sf `readlink -e ${deriva}` ~/.bdbag/deriva-cookies.txt
     study=`echo "${bdbag}" | cut -d'.' -f1`
     echo LOG: \${study}
     unzip ${bdbag}
     echo LOG: bdgag unzipped
     python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study}
-    cd \${study}
-    bash ${baseDir}/scripts/fixFetch.sh
-    cd ..
     echo LOG: fetch file filtered for only .fastq.gz
     python3 ${baseDir}/scripts/splitFetch.py --fetchFile \${study}
     echo LOG: fetch file split by replicates
@@ -67,7 +59,6 @@ process getData {
   publishDir "${outDir}/temp/${task.process}", mode: "symlink"
 
   input:
-    file deriva
     each rep from bdbagSplit
 
   output:
@@ -79,15 +70,11 @@ process getData {
     ulimit -a
     echo LOG:\${http_proxy}
     export https_proxy=\${http_proxy}
-    ln -sf `readlink -e ${deriva}` ~/.bdbag/deriva-cookies.txt
     replicate=\$(echo "${rep}" | cut -d'.' -f1 | rev | cut -f1 -d '/' | rev)
     echo LOG: \${replicate}
-    echo LOG: deriva cookie loaded
     unzip ${rep}
     echo LOG: replicate bdbag unzipped
     sh ${baseDir}/scripts/bdbagFetch.sh \${replicate}
     echo LOG: replicate bdbag fetched
-    sh ${baseDir}/scripts/renameFastq.sh \${replicate}
-    echo LOG: fastq.gz files renamed to replicate RID
     """
  }
diff --git a/workflow/scripts/modifyFetch.py b/workflow/scripts/modifyFetch.py
index bae8c22..82b1d4c 100644
--- a/workflow/scripts/modifyFetch.py
+++ b/workflow/scripts/modifyFetch.py
@@ -2,6 +2,7 @@
 
 import argparse
 import pandas as pd
+import re
 
 def get_args():
     parser = argparse.ArgumentParser()
@@ -12,8 +13,23 @@ def get_args():
 def main():
+    """Filter fetch.txt to .fastq.gz entries and rename them by replicate RID.
+
+    Reads <fetchFile>/fetch.txt and <fetchFile>/data/File.csv, replaces the
+    file-name stem of each FastQ listed in File.csv with its Replicate_RID in
+    the third fetch.txt column, and overwrites fetch.txt with the result.
+    """
     args = get_args()
     fetchFile = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None)
+    fileFile = pd.read_csv(args.fetchFile+"/data/File.csv",sep=",",header=0)
+    fileFile_filtered = fileFile[fileFile["File_Type"]=="FastQ"]
     fetchFile_filtered = fetchFile[fetchFile[2].str[-9:]==".fastq.gz"]
-    fetchFile_filtered.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False)
+    # Copy so the renames below write to this frame, not to a slice of fetchFile.
+    fetchFile_filtered_renamed = fetchFile_filtered.copy()
+    for fileName, replicateRID in zip(fileFile_filtered["File_Name"], fileFile_filtered["Replicate_RID"]):
+        fileStem = re.sub(r"\.R.\.fastq\.gz","",fileName)
+        # Rewrite every matching row from its own path; no match is a no-op.
+        matches = fetchFile_filtered_renamed[2].str.contains(fileName,regex=False)
+        fetchFile_filtered_renamed.loc[matches,2] = fetchFile_filtered_renamed.loc[matches,2].str.replace(fileStem,str(replicateRID),regex=False)
+    fetchFile_filtered_renamed.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False)
 
 if __name__ == '__main__':
     main()
\ No newline at end of file
diff --git a/workflow/scripts/renameFastq.sh b/workflow/scripts/renameFastq.sh
deleted file mode 100644
index c57eccb..0000000
--- a/workflow/scripts/renameFastq.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin
-
-while read loc checksum fileLocation; do
-  file=$(echo ${fileLocation##*/});
-  fileName=$(echo ${file%.R*.fastq.gz});
-  fileExt=$(echo ${file##${fileName}.});
-  while IFS="," read RID Study_RID Experiment_RID Replicate_RID Caption File_Type File_Name URI File_size MD5 GEO_Archival_URL dbGaP_Accession_ID Processed Notes Principal_Investigator Consortium Release_Date RCT RMT Legacy_File_RID GUDMAP_NGF_OID GUDMAP_NGS_OID; do
-    if [ "${file}" == "${File_Name}" ]; then
-      find . -type f -name "${file}" -execdir mv {} ${Replicate_RID}.${fileExt} ';';
-    fi;
-    done < $1/data/File.csv;
-done < $1/fetch.txt;
-- 
GitLab