From cbba6901ca2570b8c526e978eb80d898d7997eef Mon Sep 17 00:00:00 2001
From: Gervaise Henry <GHENRY@swmed.org>
Date: Mon, 13 Jan 2020 10:05:03 -0600
Subject: [PATCH] Get deriva download of bagit started

---
 workflow/conf/replicate_export_config.json | 97 ++++++++++++++++++++++
 workflow/rna-seq.nf                        | 40 +++++++--
 2 files changed, 132 insertions(+), 5 deletions(-)
 create mode 100644 workflow/conf/replicate_export_config.json

diff --git a/workflow/conf/replicate_export_config.json b/workflow/conf/replicate_export_config.json
new file mode 100644
index 0000000..ff17fa5
--- /dev/null
+++ b/workflow/conf/replicate_export_config.json
@@ -0,0 +1,97 @@
+{
+  "bag": {
+    "bag_name": "Replicate_{rid}",
+    "bag_algorithms": [
+      "md5"
+    ],
+    "bag_archiver": "zip"
+  },
+  "catalog": {
+    "query_processors": [
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Study",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Study_RID)=(RNASeq:Study:RID)/Study_RID:=RID,Internal_ID,Title,Summary,Overall_Design,GEO_Series_Accession_ID,GEO_Platform_Accession_ID,Funding,Pubmed_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment:RID)/Experiment_RID:=RID,Study_RID,Internal_ID,Name,Description,Experiment_Method,Sequencing_Type,Species,Specimen_Type,Molecule_Type,Pooled_Sample,Pool_Size,Markers,Cell_Count,Treatment_Protocol,Treatment_Protocol_Reference,Isolation_Protocol,Isolation_Protocol_Reference,Growth_Protocol,Growth_Protocol_Reference,Label_Protocol,Label_Protocol_Reference,Hybridization_Protocol,Hybridization_Protocol_Reference,Scan_Protocol,Scan_Protocol_Reference,Data_Processing,Value_Definition,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment Antibodies",
+          "query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Antibodies:Experiment_RID)?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment Custom Metadata",
+          "query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Custom_Metadata:Experiment_RID)?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment Settings",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Settings:Experiment_RID)/RID,Experiment_RID,Alignment_Format,Aligner,Aligner_Version,Reference_Genome,Sequence_Trimming,Duplicate_Removal,Pre-alignment_Sequence_Removal,Junction_Reads,Library_Type,Protocol_Reference,Library_Selection,Quantification_Format,Quantification_Software,Expression_Metric,Transcriptome_Model,Sequencing_Platform,Paired_End,Read_Length,Has_Strand_Specific_Information,Used_Spike_Ins,Spike_Ins_Amount,Visualization_Format,Visualization_Software,Visualization_Version,Visualization_Setting,Notes,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Replicate",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/RID,Study_RID,Experiment_RID,Biological_Replicate_Number,Technical_Replicate_Number,Specimen_RID,Collection_Date,Mapped_Reads,GEO_Sample_Accession_ID,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Specimen",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/S:=(Specimen_RID)=(Gene_Expression:Specimen:RID)/T:=left(Stage_ID)=(Vocabulary:Developmental_Stage:ID)/$S/RID,Title,Species,Stage_ID,Stage_Name:=T:Name,Stage_Detail,Assay_Type,Strain,Wild_Type,Sex,Passage,Phenotype,Cell_Line,Parent_Specimen,Upload_Notes,Preparation,Fixation,Embedding,Internal_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT,GUDMAP2_Accession_ID?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Specimen_Anatomical_Source",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Tissue:Specimen_RID)/RID,Specimen_RID,Tissue,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Specimen_Cell_Types",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Cell_Type:Specimen)/RID,Specimen_RID:=Specimen,Cell_Type,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Single Cell Metrics",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:Single_Cell_Metrics:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Reads_%28Millions%29,Reads%2FCell,Detected_Gene_Count,Genes%2FCell,UMI%2FCell,Estimated_Cell_Count,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "File",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Caption,File_Type,File_Name,URI,File_size,MD5,GEO_Archival_URL,dbGaP_Accession_ID,Processed,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT,Legacy_File_RID,GUDMAP_NGF_OID,GUDMAP_NGS_OID?limit=none"
+        }
+      },
+      {
+        "processor": "fetch",
+        "processor_params": {
+          "output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/url:=URI,length:=File_size,filename:=File_Name,md5:=MD5,Study_RID,Experiment_RID,Replicate_RID?limit=none"
+        }
+      }
+    ]
+  }
+}
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index 035faa8..5470fe1 100755
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -1,16 +1,21 @@
 #!/usr/bin/env nextflow
 
 // Define input variables
-params.deriva = "/project/BICF/BICF_Core/shared/gudmap/cookies/deriva-cookies.txt"
-params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip"
+params.deriva = "/project/BICF/BICF_Core/shared/gudmap/cookies/credential.json" // default path of the file passed to getBag as `deriva`
+params.bdbag = "/project/BICF/BICF_Core/shared/gudmap/cookies/deriva-cookies.txt" // NOTE(review): this is the cookies path previously assigned to params.deriva, no longer a bag zip - confirm intended
+params.repRID = "16-1ZX4" // default replicate RID; copied to repRID and used as the getBag tag
+//params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip" // previous default (test bag zip), kept for reference
 
 params.outDir = "${baseDir}/../output"
 
 // Parse input variables
 deriva = file(params.deriva, checkIfExists: 'true')
-bdbag = Channel
-  .fromPath(params.bdbag)
-  .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" }
+bdbag = file(params.bdbag, checkIfExists: 'true') // resolved as a single file, not a channel. NOTE(review): 'true' is a quoted string, not the boolean literal - confirm
+//bdbag = Channel // previous channel-based form, disabled
+//  .fromPath(params.bdbag)
+//  .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" }
+
+repRID = params.repRID // plain value consumed by getBag as `val repRID`
 
 outDir = params.outDir
 logsDir = "${outDir}/Logs"
@@ -18,6 +23,7 @@ logsDir = "${outDir}/Logs"
 /*
  * splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid
  */
+ /*
 process splitData {
   tag "${bdbag.baseName}"
   executor 'local'
@@ -52,6 +58,30 @@ process splitData {
     echo "LOG: bag recreated with replicate split fetch file" >> ${bdbag.baseName}.splitData.err
     """
 }
+*/
+
+/*
+ * getBag: get bagit file from consortium
+ * (the download itself is not implemented yet; for now the task only records
+ *  host diagnostics in <repRID>.getBag.err, which publishDir links into the logs dir)
+ */
+process getBag {
+  tag "${repRID}"
+  publishDir "${logsDir}/getBag", mode: 'symlink', pattern: "${repRID}.getBag.err"
+
+  input:
+    path deriva
+    val repRID
+
+  output:
+    file ("${repRID}.getBag.err")
+
+  script:
+    """
+    hostname >>${repRID}.getBag.err
+    ulimit -a >>${repRID}.getBag.err
+    """
+}
+
 
 /*
  * getData: fetch study files from consortium with downloaded bdbag.zip
-- 
GitLab