From 8ef6e674ecf65a90b7518843ff71566797ec4a89 Mon Sep 17 00:00:00 2001
From: Venkat Malladi <venkat.malladi@utsouthwestern.edu>
Date: Thu, 28 Sep 2017 13:12:43 -0500
Subject: [PATCH] Pass nextflow test for design file.

---
 test_data/design_ENCSR238SGC_SE.txt |  5 +++
 test_data/design_ENCSR729LGA_PE.txt |  5 +++
 test_data/fetch_test_data.sh        | 21 +++++++++---
 workflow/main.nf                    | 53 +++++++++++++++++++----------
 workflow/scripts/check_design.py    |  6 ++--
 5 files changed, 65 insertions(+), 25 deletions(-)
 create mode 100644 test_data/design_ENCSR238SGC_SE.txt
 create mode 100644 test_data/design_ENCSR729LGA_PE.txt

diff --git a/test_data/design_ENCSR238SGC_SE.txt b/test_data/design_ENCSR238SGC_SE.txt
new file mode 100644
index 0000000..80d4b42
--- /dev/null
+++ b/test_data/design_ENCSR238SGC_SE.txt
@@ -0,0 +1,5 @@
+sample_id	biosample	factor	treatment	replicate	control_id	fastq_read1
+ENCSR238SGC	limb	H3K4me1	None	1	ENCSR687ALB	ENCFF833BLU.fastq.gz
+ENCSR238SGC	limb	H3K4me1	None	2	ENCSR687ALB	ENCFF646LXU.fastq.gz
+ENCSR687ALB	limb	Control	None	1	ENCSR687ALB	ENCFF524CAC.fastq.gz
+ENCSR687ALB	limb	Control	None	2	ENCSR687ALB	ENCFF163AJI.fastq.gz
diff --git a/test_data/design_ENCSR729LGA_PE.txt b/test_data/design_ENCSR729LGA_PE.txt
new file mode 100644
index 0000000..0bd9fa2
--- /dev/null
+++ b/test_data/design_ENCSR729LGA_PE.txt
@@ -0,0 +1,5 @@
+sample_id	biosample	factor	treatment	replicate	control_id	fastq_read1	fastq_read2
+ENCSR729LGA	MCF-7	SP1	None	1	ENCSR217LRF	ENCFF957SQS.fastq.gz	ENCFF582IOZ.fastq.gz
+ENCSR729LGA	MCF-7	SP1	None	2	ENCSR217LRF	ENCFF330MCZ.fastq.gz	ENCFF293YFE.fastq.gz
+ENCSR217LRF	MCF-7	Control	None	1	ENCSR217LRF	ENCFF002DTU.fastq.gz	ENCFF002EFI.fastq.gz
+ENCSR217LRF	MCF-7	Control	None	2	ENCSR217LRF	ENCFF002EFG.fastq.gz	ENCFF002DTS.fastq.gz
diff --git a/test_data/fetch_test_data.sh b/test_data/fetch_test_data.sh
index 31c3b8f..a6107cd 100644
--- a/test_data/fetch_test_data.sh
+++ b/test_data/fetch_test_data.sh
@@ -1,4 +1,17 @@
-echo "Downloading test set..."
-wget -O ENCLB904PZW_R1.fastq.gz https://www.encodeproject.org/files/ENCFF704XKC/@@download/ENCFF704XKC.fastq.gz
-wget -O ENCLB904PZW_R2.fastq.gz https://www.encodeproject.org/files/ENCFF707CNX/@@download/ENCFF707CNX.fastq.gz
-echo "Done"
+echo "Downloading Single-end data set Mouse ENCSR238SGC and ENCSR687ALB"
+wget https://www.encodeproject.org/files/ENCFF833BLU/@@download/ENCFF833BLU.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF646LXU/@@download/ENCFF646LXU.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF524CAC/@@download/ENCFF524CAC.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF163AJI/@@download/ENCFF163AJI.fastq.gz
+echo "Done with Single-end"
+
+echo "Downloading Paired-end data set Human ENCSR729LGA and ENCSR217LRF"
+wget https://www.encodeproject.org/files/ENCFF957SQS/@@download/ENCFF957SQS.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF582IOZ/@@download/ENCFF582IOZ.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF330MCZ/@@download/ENCFF330MCZ.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF293YFE/@@download/ENCFF293YFE.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF002DTU/@@download/ENCFF002DTU.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF002EFI/@@download/ENCFF002EFI.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF002EFG/@@download/ENCFF002EFG.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF002DTS/@@download/ENCFF002DTS.fastq.gz
+echo "Done with Paired-end"
diff --git a/workflow/main.nf b/workflow/main.nf
index 1777855..c025d3b 100644
--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -3,31 +3,48 @@
 // Path to an input file, or a pattern for multiple inputs
 // Note - $baseDir is the location of this workflow file main.nf
 
-params.reads = "$baseDir/../test_data/*_R{1,2}.fastq.gz"
-params.singleEnd = false
-
+// Define Input variables
+params.reads = "$baseDir/../test_data/*.fastq.gz"
+params.pairedEnd = false
+params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt"
 
+// Build fileList.tsv: one "<file name><TAB><full path>" line per read file
 Channel
-    .fromFilePairs( params.reads, size: params.singleEnd ? 1 : 2 )
-    .ifEmpty { error "Cannot find any reads matching: ${params.reads}\nIf this is single-end data, please specify."}
-    .set { read_pairs }
+  .fromPath( params.reads )
+  .flatten()
+  .map { file -> [ file.getFileName().toString(), file.toString() ].join("\t")}
+  .collectFile( name: 'fileList.tsv', newLine: true )
+  .set { readsList }
+
+// Define regular variables
+pairedEnd = params.pairedEnd
+designFile = params.designFile
+
+process checkDesignFile {
+
+  publishDir "$baseDir/output/design", mode: 'copy'
 
-process qc_fastq {
-    tag "$name"
+  input:
 
-    publishDir "$baseDir/output/$name/$task.process", mode: 'copy'
+  designFile
+  file readsList
 
-    input:
-    set val(name), file(reads) from read_pairs
+  output:
 
-    output:
-    file "*_fastqc.{zip,html}" into qc_fastq_results
-    file "qc.log" into qc_fastq_log
+  file("design.tsv") into designFilePaths
 
-    script:
+  script:
+
+  if (pairedEnd) {
+    """
+    echo $designFile
+    python $baseDir/scripts/check_design.py -d $designFile -f $readsList -p
     """
-    module load python/3.6.1-2-anaconda
-    module load fastqc/0.11.5
-    $baseDir/scripts/qc_fastq.py -f $reads
+  }
+  else {
     """
+    python $baseDir/scripts/check_design.py -d $designFile -f $readsList
+    """
+  }
+
 }
diff --git a/workflow/scripts/check_design.py b/workflow/scripts/check_design.py
index beb59b2..929a97d 100644
--- a/workflow/scripts/check_design.py
+++ b/workflow/scripts/check_design.py
@@ -35,7 +35,7 @@ def get_args():
 
     parser.add_argument('-p', '--paired',
                         help="True/False if paired-end or single end.",
-                        default=True,
+                        default=False,
                         action='store_true')
 
     args = parser.parse_args()
@@ -119,7 +119,7 @@ def main():
 
     # Read files
     design_file = pd.read_csv(args.design, sep='\t')
-    fastq_file = pd.read_csv(args.design, sep='\t', names=['name', 'path'])
+    fastq_file = pd.read_csv(args.fastq, sep='\t', names=['name', 'path'])
 
     # Check design file
     check_design_headers(design_file, args.paired)
@@ -127,7 +127,7 @@ def main():
     new_design = check_files(design_file, fastq_file, args.paired)
 
     # Write out new design file
-    new_design.to_csv('design.tsv', header=True, index=False)
+    new_design.to_csv('design.tsv', header=True, sep='\t', index=False)
 
 
 if __name__ == '__main__':
-- 
GitLab