Add uploadInputBag

eb83dee4 · Gervaise Henry · dcda53fd · eb83dee4 · eb83dee4 · eb83dee4
Commit eb83dee4 authored 4 years ago by Gervaise Henry
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
-# v0.0.4 (in development)
+# v1.0.0 (in development)
 **User Facing**
 * Add option to pull references from datahub
 * Add option to send email on workflow error, with pipeline error message
 * Add versions and paper references of software used to report
+* Upload input bag

 **Background**
 * Remove (comment out) option to pull references from S3

--- a/workflow/conf/aws.config
+++ b/workflow/conf/aws.config
@@ -88,4 +88,8 @@ process {
    cpus = 1
    memory = '1 GB'
  }
+  withName: uploadInputBag {
+    cpus = 1
+    memory = '1 GB'
+  }
 }
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -61,6 +61,9 @@ process {
  withName: outputBag {
    executor = 'local'
  }
+  withName: uploadInputBag {
+    executor = 'local'
+  }
 }

 singularity {

--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -70,6 +70,9 @@ process {
  withName:outputBag {
    container = 'bicf/gudmaprbkfilexfer:2.0.1_indev'
  }
+  withName:uploadInputBag {
+    container = 'gudmaprbk/deriva1.3:1.0.0'
+  }
 }

 trace {
@@ -98,6 +101,6 @@ manifest {
  homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq'
  description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.'
  mainScript = 'rna-seq.nf'
-  version = 'v0.0.4_indev'
+  version = 'v1.0.0_indev'
  nextflowVersion = '>=19.09.0'
 }
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -36,6 +36,7 @@ deriva.into {
  deriva_getBag
  deriva_getRefInfer
  deriva_getRef
+  deriva_uploadInputBag
 }
 bdbag = Channel
  .fromPath(params.bdbag)
@@ -82,7 +83,7 @@ script_refData = Channel.fromPath("${baseDir}/scripts/extractRefData.py")
 script_calculateTPM = Channel.fromPath("${baseDir}/scripts/calculateTPM.R")
 script_convertGeneSymbols = Channel.fromPath("${baseDir}/scripts/convertGeneSymbols.R")
 script_tinHist = Channel.fromPath("${baseDir}/scripts/tinHist.py")
-
+script_uploadInputBag = Channel.fromPath("${baseDir}/scripts/uploadInputBag.py")

 /*
 * trackStart: track start of pipeline
@@ -177,6 +178,10 @@ if (inputBagForce != "") {
 } else {
  inputBag = bag
 }
+inputBag.into {
+  inputBag_getData
+  inputBag_uploadInputBag
+}

 /*
 * getData: fetch study files from consortium with downloaded bdbag.zip
@@ -187,7 +192,7 @@ process getData {
  input:
    path script_bdbagFetch
    path cookies, stageAs: "deriva-cookies.txt" from bdbag
-    path inputBag
+    path inputBag from inputBag_getData

  output:
    path ("*.R{1,2}.fastq.gz") into fastqs
@@ -1267,6 +1272,67 @@ process outputBag {
  """
 }

+/* 
+ * uploadInputBag: uploads the input bag
+*/
+process uploadInputBag {
+  tag "${repRID}"
+
+  input:
+    path script_uploadInputBag
+    path inputBag from inputBag_uploadInputBag
+    path credential, stageAs: "credential.json" from deriva_uploadInputBag
+
+  output:
+    path ("inputBagRID.csv") into inputBagRIDfl
+
+  script:
+  """
+  hostname > ${repRID}.uploadInputBag.log
+  ulimit -a >> ${repRID}.uploadInputBag.log
+
+  yr=\$(date +'%Y')
+  mn=\$(date +'%m')
+  dy=\$(date +'%d')
+
+  hatrac=\$(deriva-hatrac-cli --host dev.gudmap.org ls /hatrac/resources/rnaseq/pipeline/input_bag/ | grep -o \${yr}_\${mn}_\${dy})
+  if [ -z "\${hatrac}" ]
+  then
+    deriva-hatrac-cli --host ${source} mkdir /hatrac/resources/rnaseq/pipeline/input_bag/\${yr}_\${mn}_\${dy}
+    echo LOG: hatrac folder created - /hatrac/resources/rnaseq/pipeline/input_bag/\${yr}_\${mn}_\${dy} >> ${repRID}.uploadInputBag.log
+  else
+    echo LOG: hatrac folder already exists - /hatrac/resources/rnaseq/pipeline/input_bag/\${yr}_\${mn}_\${dy} >> ${repRID}.uploadInputBag.log
+  fi
+
+  file=\$(basename -a ${inputBag})
+  md5=\$(md5sum ./\${file} | awk '{ print \$1 }')
+  echo LOG: ${repRID} input bag md5 sum - \${md5} >> ${repRID}.uploadInputBag.log
+  size=\$(wc -c < ./\${file})
+  echo LOG: ${repRID} input bag size - \${size} bytes >> ${repRID}.uploadInputBag.log
+  exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=\${md5})
+  if [ "\${exist}" == "[]" ]
+  then
+      cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
+      cookie=\${cookie:11:-1}
+      loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/input_bag/\${yr}_\${mn}_\${dy}/\${file})
+      inputBag_rid=\$(python3 uploadInputBag.py -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie})
+      echo LOG: input bag RID uploaded - \${inputBag_rid} >> ${repRID}.uploadInputBag.log
+      rid=\${inputBag_rid}
+  else
+      exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
+      exist=\${exist:8:-6}
+      echo LOG: input bag RID already exists - \${exist} >> ${repRID}.uploadInputBag.log
+      rid=\${exist}
+  fi
+
+  echo \${rid} > inputBagRID.csv
+  """
+}
+
+inputBagRID = Channel.create()
+inputBagRIDfl.splitCsv(sep: ",", header: false).separate(
+  inputBagRID
+)

 workflow.onError = {
  subject = "$workflow.manifest.name FAILED: $params.repRID"

--- a/workflow/scripts/uploadInputBag.py
+++ b/workflow/scripts/uploadInputBag.py
+import argparse
+from deriva.core import ErmrestCatalog, get_credential, BaseCLI
+import sys
+import csv
+from datetime import datetime
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f', '--file', help="file", required=True)
+    parser.add_argument('-l', '--loc', help="location", required=True)
+    parser.add_argument('-s', '--md5', help="md5", required=True)
+    parser.add_argument('-b', '--bytes', help="bytes", required=True)
+    parser.add_argument('-o', '--host', help="bytes", required=True)
+    parser.add_argument('-c', '--cookie', help="bytes", required=True)
+    args = parser.parse_args()
+    return args
+
+def main(hostname, catalog_number, credential):
+    catalog = ErmrestCatalog('https', hostname, catalog_number, credential)
+    pb = catalog.getPathBuilder()
+    inputBag_table = pb.RNASeq.Input_Bag
+
+    inputBag_data = {
+        "File_Name": args.file,
+        "File_URL": args.loc,
+        "File_MD5": args.md5,
+        "File_Bytes": args.bytes,
+        "File_Creation_Time": datetime.now().replace(microsecond=0).isoformat(),
+        "Notes": "TEST",
+        "Bag_Type": "Replicate_Input_Seq"
+        }
+
+    entities = inputBag_table.insert([inputBag_data])
+    rid = entities[0]["RID"]
+
+    print(rid)
+    
+        
+if __name__ == '__main__':
+    args = get_args()
+    cli = BaseCLI("Custom RNASeq query", None, 1)
+    cli.remove_options(["--config-file"])
+    host = args.host
+    credential = {"cookie": args.cookie}
+    main(host, 2, credential)
\ No newline at end of file