From a2cbcce164f4f3c9835852834a05576257cbd5d3 Mon Sep 17 00:00:00 2001
From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu>
Date: Wed, 6 Jan 2021 22:59:11 -0600
Subject: [PATCH] Make new inputBag for testing with txt fetch instead of fastq

---
 .gitlab-ci.yml                                |  2 +-
 test_data/createTestData.sh                   |  5 +-
 .../conf/Replicate_For_Input_Bag(test).json   | 97 +++++++++++++++++++
 workflow/scripts/bdbag_fetch.sh               | 40 ++++----
 4 files changed, 118 insertions(+), 26 deletions(-)
 create mode 100644 workflow/conf/Replicate_For_Input_Bag(test).json

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3a60d00..420ba8c 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -85,7 +85,7 @@ getData:
   - singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' bdbag --version > version_bdbag.txt
   - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt
   - unzip ./test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip
-  - singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' bash ./workflow/scripts/bdbag_fetch.sh Q-Y5F6_inputBag Q-Y5F6 TEST
+  - singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' bash ./workflow/scripts/bdbag_fetch.sh Q-Y5F6_inputBag Q-Y5F6
   - pytest -m getData
   artifacts:
     name: "$CI_JOB_NAME"
diff --git a/test_data/createTestData.sh b/test_data/createTestData.sh
index e3e4c52..6ae47ec 100644
--- a/test_data/createTestData.sh
+++ b/test_data/createTestData.sh
@@ -12,11 +12,12 @@ mkdir -p NEW_test_data
 ln -sfn ./test_data/auth/credential.json ~/.deriva/credential.json
 
 mkdir -p ./NEW_test_data/bag
-singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-download-cli staging.gudmap.org --catalog 2 ../workflow/conf/Replicate_For_Input_Bag.json . rid=Q-Y5F6
+singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-download-cli staging.gudmap.org --catalog 2 '../workflow/conf/Replicate_For_Input_Bag(test).json' . rid=Q-Y5F6
 cp Q-Y5F6_inputBag.zip ./NEW_test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip
+singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-download-cli staging.gudmap.org --catalog 2 ../workflow/conf/Replicate_For_Input_Bag.json . rid=Q-Y5F6
 
 mkdir -p ./NEW_test_data/fastq
-unzip ./NEW_test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip
+unzip ./Q-Y5F6_inputBag.zip
 singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' bash ../workflow/scripts/bdbag_fetch.sh Q-Y5F6_inputBag Q-Y5F6
 cp Q-Y5F6.R1.fastq.gz ./NEW_test_data/fastq/Q-Y5F6.R1.fastq.gz
 cp Q-Y5F6.R2.fastq.gz ./NEW_test_data/fastq/Q-Y5F6.R2.fastq.gz
diff --git a/workflow/conf/Replicate_For_Input_Bag(test).json b/workflow/conf/Replicate_For_Input_Bag(test).json
new file mode 100644
index 0000000..46fefe8
--- /dev/null
+++ b/workflow/conf/Replicate_For_Input_Bag(test).json
@@ -0,0 +1,97 @@
+{
+  "bag": {
+    "bag_name": "{rid}_inputBag",
+    "bag_algorithms": [
+      "md5"
+    ],
+    "bag_archiver": "zip"
+  },
+  "catalog": {
+    "query_processors": [
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Study",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Study_RID)=(RNASeq:Study:RID)/Study_RID:=RID,Internal_ID,Title,Summary,Overall_Design,GEO_Series_Accession_ID,GEO_Platform_Accession_ID,Funding,Pubmed_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment:RID)/Experiment_RID:=RID,Study_RID,Internal_ID,Name,Description,Experiment_Method,Sequencing_Type,Species,Specimen_Type,Molecule_Type,Pooled_Sample,Pool_Size,Markers,Cell_Count,Treatment_Protocol,Treatment_Protocol_Reference,Isolation_Protocol,Isolation_Protocol_Reference,Growth_Protocol,Growth_Protocol_Reference,Label_Protocol,Label_Protocol_Reference,Hybridization_Protocol,Hybridization_Protocol_Reference,Scan_Protocol,Scan_Protocol_Reference,Data_Processing,Value_Definition,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment Antibodies",
+          "query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Antibodies:Experiment_RID)?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment Custom Metadata",
+          "query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Custom_Metadata:Experiment_RID)?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment Settings",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Settings:Experiment_RID)/RID,Experiment_RID,Alignment_Format,Aligner,Aligner_Version,Reference_Genome,Sequence_Trimming,Duplicate_Removal,Pre-alignment_Sequence_Removal,Junction_Reads,Library_Type,Protocol_Reference,Library_Selection,Quantification_Format,Quantification_Software,Expression_Metric,Transcriptome_Model,Sequencing_Platform,Paired_End,Read_Length,Has_Strand_Specific_Information,Used_Spike_Ins,Spike_Ins_Amount,Visualization_Format,Visualization_Software,Visualization_Version,Visualization_Setting,Notes,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Replicate",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/RID,Study_RID,Experiment_RID,Biological_Replicate_Number,Technical_Replicate_Number,Specimen_RID,Collection_Date,Mapped_Reads,GEO_Sample_Accession_ID,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Specimen",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/S:=(Specimen_RID)=(Gene_Expression:Specimen:RID)/T:=left(Stage_ID)=(Vocabulary:Developmental_Stage:ID)/$S/RID,Title,Species,Stage_ID,Stage_Name:=T:Name,Stage_Detail,Assay_Type,Strain,Wild_Type,Sex,Passage,Phenotype,Cell_Line,Parent_Specimen,Upload_Notes,Preparation,Fixation,Embedding,Internal_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT,GUDMAP2_Accession_ID?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Specimen_Anatomical_Source",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Tissue:Specimen_RID)/RID,Specimen_RID,Tissue,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Specimen_Cell_Types",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Cell_Type:Specimen)/RID,Specimen_RID:=Specimen,Cell_Type,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Single Cell Metrics",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:Single_Cell_Metrics:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Reads_%28Millions%29,Reads%2FCell,Detected_Gene_Count,Genes%2FCell,UMI%2FCell,Estimated_Cell_Count,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "File",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Caption,File_Type,File_Name,URI,File_size,MD5,GEO_Archival_URL,dbGaP_Accession_ID,Processed,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT,Legacy_File_RID,GUDMAP_NGF_OID,GUDMAP_NGS_OID?limit=none"
+        }
+      },
+      {
+        "processor": "fetch",
+        "processor_params": {
+          "output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/File_Type=txt/url:=URI,length:=File_size,filename:=File_Name,md5:=MD5,Study_RID,Experiment_RID,Replicate_RID?limit=none"
+        }
+      }
+    ]
+  }
+}
diff --git a/workflow/scripts/bdbag_fetch.sh b/workflow/scripts/bdbag_fetch.sh
index b3b1dd4..6a6253c 100644
--- a/workflow/scripts/bdbag_fetch.sh
+++ b/workflow/scripts/bdbag_fetch.sh
@@ -1,30 +1,24 @@
 #!/bin/bash
 
-if [ -z "${3}" ]
+bdbag --materialize ${1} --debug
+validateError=true
+bdbag --validate full ${1} && validateError=false
+if validateError
 then
-        bdbag --materialize ${1} --debug
-    validateError=true
-    bdbag --validate full ${1} && validateError=false
-    if validateError
-    then
-        n=0
-        until [ "${n}" -ge "3" ]
-        do
-            bdbag --resolve-fetch missing --validate full ${1} --debug && validateError=false && break
-            n=$((n+1)) 
-            sleep 15
-        done
-    fi
-    if validateError
-    then
-        exit 1
-    fi
-    for i in $(find */ -name "*R*.fastq.gz")
+    n=0
+    until [ "${n}" -ge "3" ]
     do
-        path=${2}.$(echo ${i##*/} | grep -o "R[1,2].fastq.gz")
-        cp ${i} ./${path}
+        bdbag --resolve-fetch missing --validate full ${1} --debug && validateError=false && break
+        n=$((n+1)) 
+        sleep 15
     done
-elif [ "${3}" == "TEST" ]
+fi
+if validateError
 then
-    bdbag --validate-profile ${1} --debug
+    exit 1
 fi
+for i in $(find */ -name "*R*.fastq.gz")
+do
+    path=${2}.$(echo ${i##*/} | grep -o "R[1,2].fastq.gz")
+    cp ${i} ./${path}
+done
\ No newline at end of file
-- 
GitLab