From 92f9b5007185fc27e7906cc6ec6032904a064031 Mon Sep 17 00:00:00 2001 From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu> Date: Mon, 28 Oct 2019 20:49:46 -0500 Subject: [PATCH] Add file rename in fetch.txt to modifyFetch.py --- workflow/rna-seq.nf | 15 +-------------- workflow/scripts/modifyFetch.py | 8 +++++++- workflow/scripts/renameFastq.sh | 12 ------------ 3 files changed, 8 insertions(+), 27 deletions(-) delete mode 100644 workflow/scripts/renameFastq.sh diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index 32d4077..fe52377 100755 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -9,9 +9,6 @@ params.outDir = "${baseDir}/../output" // Parse input variables deriva = file(params.deriva) deriva.copyTo('~/.bdbag/deriva-cookies.txt') -//deriva = Channel -// .fromPath(params.deriva) -// .ifEmpty { exit 1, "deriva cookie file not found: ${params.deriva}" } bdbag = Channel .fromPath(params.bdbag) .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" } @@ -19,7 +16,7 @@ bdbag = Channel outDir = params.outDir /* - * splitData: split bdbag files by replicate so fetch can occure in parallel + * splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid */ process splitData { tag "${bdbag.baseName}" @@ -27,7 +24,6 @@ process splitData { input: file bdbag - file deriva output: file("Replicate_*.zip") into bdbagSplit mode flatten @@ -39,15 +35,11 @@ process splitData { """ hostname ulimit -a - ln -sf `readlink -e ${deriva}` ~/.bdbag/deriva-cookies.txt study=`echo "${bdbag}" | cut -d'.' -f1` echo LOG: \${study} unzip ${bdbag} echo LOG: bdgag unzipped python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study} - cd \${study} - bash ${baseDir}/scripts/fixFetch.sh - cd .. echo LOG: fetch file filtered for only .fastq.gz python3 ${baseDir}/scripts/splitFetch.py --fetchFile \${study} echo LOG: fetch file split by replicates @@ -67,7 +59,6 @@ process getData { publishDir "${outDir}/temp/${task.process}", mode: "symlink" input: - file deriva each rep from bdbagSplit output: @@ -79,15 +70,11 @@ process getData { ulimit -a echo LOG:\${http_proxy} export https_proxy=\${http_proxy} - ln -sf `readlink -e ${deriva}` ~/.bdbag/deriva-cookies.txt replicate=\$(echo "${rep}" | cut -d'.' -f1 | rev | cut -f1 -d '/' | rev) echo LOG: \${replicate} - echo LOG: deriva cookie loaded unzip ${rep} echo LOG: replicate bdbag unzipped sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} echo LOG: replicate bdbag fetched - sh ${baseDir}/scripts/renameFastq.sh \${replicate} - echo LOG: fastq.gz files renamed to replicate RID """ } diff --git a/workflow/scripts/modifyFetch.py b/workflow/scripts/modifyFetch.py index bae8c22..82b1d4c 100644 --- a/workflow/scripts/modifyFetch.py +++ b/workflow/scripts/modifyFetch.py @@ -2,6 +2,7 @@ import argparse import pandas as pd +import re def get_args(): parser = argparse.ArgumentParser() @@ -12,8 +13,13 @@ def get_args(): def main(): args = get_args() fetchFile = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None) + fileFile = pd.read_csv(args.fetchFile+"/data/File.csv",sep=",",header=0) + fileFile_filtered = fileFile[fileFile["File_Type"]=="FastQ"] fetchFile_filtered = fetchFile[fetchFile[2].str[-9:]==".fastq.gz"] - fetchFile_filtered.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False) + fetchFile_filtered_renamed = fetchFile_filtered + for i in fileFile_filtered["File_Name"]: + fetchFile_filtered_renamed[2][fetchFile_filtered_renamed[2].str.contains(i,regex=False)] = fetchFile_filtered_renamed[2][fetchFile_filtered_renamed[2].str.contains(i,regex=False)].values[0].replace(re.sub("\.R.\.fastq\.gz","",i),fileFile_filtered["Replicate_RID"][fileFile_filtered["File_Name"]==i].values[0]) + fetchFile_filtered_renamed.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False) if __name__ == '__main__': main() \ No newline at end of file diff --git a/workflow/scripts/renameFastq.sh b/workflow/scripts/renameFastq.sh deleted file mode 100644 index c57eccb..0000000 --- a/workflow/scripts/renameFastq.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin - -while read loc checksum fileLocation; do - file=$(echo ${fileLocation##*/}); - fileName=$(echo ${file%.R*.fastq.gz}); - fileExt=$(echo ${file##${fileName}.}); - while IFS="," read RID Study_RID Experiment_RID Replicate_RID Caption File_Type File_Name URI File_size MD5 GEO_Archival_URL dbGaP_Accession_ID Processed Notes Principal_Investigator Consortium Release_Date RCT RMT Legacy_File_RID GUDMAP_NGF_OID GUDMAP_NGS_OID; do - if [ "${file}" == "${File_Name}" ]; then - find . -type f -name "${file}" -execdir mv {} ${Replicate_RID}.${fileExt} ';'; - fi; - done < $1/data/File.csv; -done < $1/fetch.txt; -- GitLab