diff --git a/test_data/design_ENCSR238SGC_SE.txt b/test_data/design_ENCSR238SGC_SE.txt new file mode 100644 index 0000000000000000000000000000000000000000..80d4b42ac047e37d15036f9638f51e5ef33c6c2e --- /dev/null +++ b/test_data/design_ENCSR238SGC_SE.txt @@ -0,0 +1,5 @@ +sample_id biosample factor treatment replicate control_id fastq_read1 +ENCSR238SGC limb H3K4me1 None 1 ENCSR687ALB ENCFF833BLU.fastq.gz +ENCSR238SGC limb H3K4me1 None 2 ENCSR687ALB ENCFF646LXU.fastq.gz +ENCSR687ALB limb Control None 1 ENCSR687ALB ENCFF524CAC.fastq.gz +ENCSR687ALB limb Control None 2 ENCSR687ALB ENCFF163AJI.fastq.gz diff --git a/test_data/design_ENCSR729LGA_PE.txt b/test_data/design_ENCSR729LGA_PE.txt new file mode 100644 index 0000000000000000000000000000000000000000..0bd9fa22263b465223faa1a8bd5e4f91cbee2334 --- /dev/null +++ b/test_data/design_ENCSR729LGA_PE.txt @@ -0,0 +1,5 @@ +sample_id biosample factor treatment replicate control_id fastq_read1 fastq_read2 +ENCSR729LGA MCF-7 SP1 None 1 ENCSR217LRF ENCFF957SQS.fastq.gz ENCFF582IOZ.fastq.gz +ENCSR729LGA MCF-7 SP1 None 2 ENCSR217LRF ENCFF330MCZ.fastq.gz ENCFF293YFE.fastq.gz +ENCSR217LRF MCF-7 Control None 1 ENCSR217LRF ENCFF002DTU.fastq.gz ENCFF002EFI.fastq.gz +ENCSR217LRF MCF-7 Control None 1 ENCSR217LRF ENCFF002EFG.fastq.gz ENCFF002DTS.fastq.gz diff --git a/test_data/fetch_test_data.sh b/test_data/fetch_test_data.sh index 31c3b8fd3257a545acbc8127c4ab77ecd85ed6aa..a6107cdd49252429c01b54313fb64e3479ee5813 100644 --- a/test_data/fetch_test_data.sh +++ b/test_data/fetch_test_data.sh @@ -1,4 +1,17 @@ -echo "Downloading test set..." -wget -O ENCLB904PZW_R1.fastq.gz https://www.encodeproject.org/files/ENCFF704XKC/@@download/ENCFF704XKC.fastq.gz -wget -O ENCLB904PZW_R2.fastq.gz https://www.encodeproject.org/files/ENCFF707CNX/@@download/ENCFF707CNX.fastq.gz -echo "Done" +echo "Downloading Single-end data set Mouse ENCSR238SGC and ENCSR687ALB" +wget https://www.encodeproject.org/files/ENCFF833BLU/@@download/ENCFF833BLU.fastq.gz +wget https://www.encodeproject.org/files/ENCFF646LXU/@@download/ENCFF646LXU.fastq.gz +wget https://www.encodeproject.org/files/ENCFF524CAC/@@download/ENCFF524CAC.fastq.gz +wget https://www.encodeproject.org/files/ENCFF163AJI/@@download/ENCFF163AJI.fastq.gz +echo "Done with Single-end" + +echo "Downloading Paired-end data set Human ENCSR729LGA and ENCSR217LRF" +wget https://www.encodeproject.org/files/ENCFF957SQS/@@download/ENCFF957SQS.fastq.gz +wget https://www.encodeproject.org/files/ENCFF582IOZ/@@download/ENCFF582IOZ.fastq.gz +wget https://www.encodeproject.org/files/ENCFF330MCZ/@@download/ENCFF330MCZ.fastq.gz +wget https://www.encodeproject.org/files/ENCFF293YFE/@@download/ENCFF293YFE.fastq.gz +wget https://www.encodeproject.org/files/ENCFF002DTU/@@download/ENCFF002DTU.fastq.gz +wget https://www.encodeproject.org/files/ENCFF002EFI/@@download/ENCFF002EFI.fastq.gz +wget https://www.encodeproject.org/files/ENCFF002EFG/@@download/ENCFF002EFG.fastq.gz +wget https://www.encodeproject.org/files/ENCFF002DTS/@@download/ENCFF002DTS.fastq.gz +echo "Done with Paired-end" diff --git a/workflow/main.nf b/workflow/main.nf index 1777855160f2deab00d98a76ead04d201031d27e..c025d3b4c79334161f7885b8705f3e544822dd2e 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -3,31 +3,48 @@ // Path to an input file, or a pattern for multiple inputs // Note - $baseDir is the location of this workflow file main.nf -params.reads = "$baseDir/../test_data/*_R{1,2}.fastq.gz" -params.singleEnd = false - +// Define Input variables +params.reads = "$baseDir/../test_data/*.fastq.gz" +params.pairedEnd = false +params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt" +// Define List of Files Channel - .fromFilePairs( params.reads, size: params.singleEnd ? 1 : 2 ) - .ifEmpty { error "Cannot find any reads matching: ${params.reads}\nIf this is single-end data, please specify."} - .set { read_pairs } + .fromPath( params.reads ) + .flatten() + .map { file -> [ file.getFileName().toString(), file.toString() ].join("\t")} + .collectFile( name: 'fileList.tsv', newLine: true ) + .set { readsList } + +// Define regular variables +pairedEnd = params.pairedEnd +designFile = params.designFile + +process checkDesignFile { + + publishDir "$baseDir/output/design", mode: 'copy' -process qc_fastq { - tag "$name" + input: - publishDir "$baseDir/output/$name/$task.process", mode: 'copy' + designFile + file readsList - input: - set val(name), file(reads) from read_pairs + output: - output: - file "*_fastqc.{zip,html}" into qc_fastq_results - file "qc.log" into qc_fastq_log + file("design.tsv") into designFilePaths - script: + script: + + if (pairedEnd) { + """ + echo $designFile + python $baseDir/scripts/check_design.py -d $designFile -f $readsList -p """ - module load python/3.6.1-2-anaconda - module load fastqc/0.11.5 - $baseDir/scripts/qc_fastq.py -f $reads + } + else { """ + python $baseDir/scripts/check_design.py -d $designFile -f $readsList + """ + } + } diff --git a/workflow/scripts/check_design.py b/workflow/scripts/check_design.py index beb59b25ffa2e7f4d9f7f3f66cbaf0eee464d3c5..929a97dc3b12da8ddd254671bdb62ff4a194c063 100644 --- a/workflow/scripts/check_design.py +++ b/workflow/scripts/check_design.py @@ -35,7 +35,7 @@ def get_args(): parser.add_argument('-p', '--paired', help="True/False if paired-end or single end.", - default=True, + default=False, action='store_true') args = parser.parse_args() @@ -119,7 +119,7 @@ def main(): # Read files design_file = pd.read_csv(args.design, sep='\t') - fastq_file = pd.read_csv(args.design, sep='\t', names=['name', 'path']) + fastq_file = pd.read_csv(args.fastq, sep='\t', names=['name', 'path']) # Check design file check_design_headers(design_file, args.paired) @@ -127,7 +127,7 @@ def main(): new_design = check_files(design_file, fastq_file, args.paired) # Write out new design file - new_design.to_csv('design.tsv', header=True, index=False) + new_design.to_csv('design.tsv', header=True, sep='\t', index=False) if __name__ == '__main__':