diff --git a/astrocyte_pkg.yml b/astrocyte_pkg.yml
index cf9d8a32de8c8824f89f4575c5d1a3818d1f0092..0f884de970f735e91f10cacfbb6777cb2b68495b 100644
--- a/astrocyte_pkg.yml
+++ b/astrocyte_pkg.yml
@@ -13,12 +13,17 @@ author: 'Felix Perez, Achisha Saikia, Peng Lian'
 # A contact email address for questions
 email: 'felix.perez@utsouthwestern.edu, achisha.saikia@utsouthwestern.edu, biohpc-help@utsouthwestern.edu'
 # A more informative title for the workflow package
-title: 'ATAC-seq Source Workflow"
-
-This pipeline is designed for automated end-to-end quality control and processing of ATAC-seq. The pipeline can be run end-to-end, starting from raw FASTQ files all the way to peak calling and signal track generation using a single caper submit command. One can also start the pipeline from intermediate stages (for example, using alignment files as input). The pipeline supports both single-end and paired-end data as well as replicated or non-replicated datasets. The outputs produced by the pipeline include 1) formatted HTML reports that include quality control measures specifically designed for ATAC-seq and DNase-seq data, 2) analysis of reproducibility, 3) stringent and relaxed thresholding of peaks, 4) fold-enrichment and pvalue signal tracks.
+title: 'ATAC-seq Source Workflow'
 
 description: |
-  # TODO: Please describe the workflow. (AS)
+  This pipeline is designed for automated end-to-end quality control and processing of ATAC-seq.
+  The pipeline can be run end-to-end, starting from raw FASTQ files all the way to peak calling
+  and signal track generation using a single caper submit command. One can also start the pipeline from intermediate
+  stages (for example, using alignment files as input). The pipeline supports both single-end and paired-end data as
+  well as replicated or non-replicated datasets. The outputs produced by the pipeline include 1) formatted HTML reports
+  that include quality control measures specifically designed for ATAC-seq and DNase-seq data, 2) analysis of
+  reproducibility, 3) stringent and relaxed thresholding of peaks, 4) fold-enrichment and pvalue signal tracks.
+
 #### New Features in Astrocyte 0.4.0 and above ####
 
 citation: |
diff --git a/workflow/configs/biohpc.config b/workflow/configs/biohpc.config
index 465d7e7bd09bbd7b6bcf8541c74fc87b271d0b73..94565f9c577a3fb6ab583b3f3a140702d011ee5f 100755
--- a/workflow/configs/biohpc.config
+++ b/workflow/configs/biohpc.config
@@ -1,6 +1,11 @@
 singularity {
     enabled = true
-    runOptions = ' -B /cm/shared/apps/slurm/16.05.8 -B /etc/slurm -B /cm/shared/apps/slurm/var/etc/ -B /usr/lib64/libreadline.so.6 -B /usr/lib64/libhistory.so.6 -B /usr/lib64/libtinfo.so.5 -B /var/run/munge -B /usr/lib64/libmunge.so.2 -B /usr/lib64/libmunge.so.2.0.0 -B /cm/shared/apps/slurm/16.05.8/lib64/slurm/ -B /cm/shared/apps/slurm/16.05.8/lib64'
+    runOptions = '\
+        --bind /cm/shared/apps/slurm/16.05.8,/etc/slurm,/cm/shared/apps/slurm/var/etc/,/usr/lib64/libreadline.so.6 \
+        --bind /usr/lib64/libhistory.so.6,/usr/lib64/libtinfo.so.5,/var/run/munge,/usr/lib64/libmunge.so.2 \
+        --bind /usr/lib64/libmunge.so.2.0.0,/cm/shared/apps/slurm/16.05.8/lib64/slurm/ \
+        --bind /cm/shared/apps/java/oracle/jdk1.8.0_231'
+
     // Please do NOT use "--disable-cache" in this runOptions.
     // Starting from version 2.0.0, the astrocyte_cli will clean up the cache automatically.
     // runOptions = '--bind /vagrant:/vagrant' // Use this one for vagrant development env only
diff --git a/workflow/main.nf b/workflow/main.nf
index adc3f4bb718cd94cdd965e1f80caf985f165f3b2..30b3662815c0c15bf3f65cac190ab14e4dc30a45 100644
--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -1,8 +1,11 @@
 /*
  * Copyright (c) 2024. The University of Texas Southwestern Medical Center
  *
- * TODO: (AC) Brief description of ATAC-seq (DONE)
- * ATAC-seq is a molecular biology technique that assesses chromatin accessibility in a genome. It uses a hyperactive Tn5 transposase to insert sequencing adapters into open chromatin regions, allowing researchers to identify and sequence these accessible genomic regions. ATAC-seq is widely used to study gene regulation, identify enhancers and promoters, and gain insights into chromatin structure.
+ * ATAC-seq is a molecular biology technique that assesses chromatin accessibility in a genome.
+ * It uses a hyperactive Tn5 transposase to insert sequencing adapters into open chromatin regions,
+ * allowing researchers to identify and sequence these accessible genomic regions. ATAC-seq is widely
+ * used to study gene regulation, identify enhancers and promoters, and gain insights into chromatin structure.
+ *
  * @authors
  * Felix Perez, Achisha Saikia
  *
@@ -23,10 +26,45 @@ process runSource {
     output:
         file '*'
 
-    """
-    export LD_LIBRARY_PATH=/usr/lib64/:$LD_LIBRARY_PATH
+    shell:
+    '''
+    # Allow for the container to use the libraries & paths of Slurm on BioHPC.
+    export LD_LIBRARY_PATH=/atac/jdk-12/lib:/usr/lib64:/lib:$LD_LIBRARY_PATH
+    export PATH=/atac/jdk-12:/atac/jdk-12/bin:/bin:/cm/shared/apps/slurm/16.05.8/bin:$PATH
+
+    # Provide the container the SlurmUser (user and group) info used on Nucleus.
+    echo "slurm:x:450:450::/cm/local/apps/slurm:/bin/bash" >> /etc/passwd
+    echo "slurm:x:450:" >> /etc/group
+
+    # Source the container's entrypoint script to have access to the caper
+    # commands to run the ATAC-seq pipeline in the runner.
+    source /atac/entrypoint.sh
+
+    # Record the relevant software versions.
+    java -version 2> java_version.txt
     sinfo -V > slurm_version.txt
     caper --version > caper_version.txt
-    caper hpc submit $baseDir/external_repo/astrocyte-atac-runner/atac.wdl -i $inputJson --singularity --leader-job-name atac-source 1> batch_job.txt 2>> caper_err.txt
-    """
+
+    # Launch the ATAC-seq leader job.
+    submit=$(caper hpc submit !{baseDir}/external_repo/astrocyte-atac-runner/atac.wdl -i !{inputJson} --singularity --leader-job-name atac-source)
+
+    # Poll the leader job every 15 seconds until Slurm reports a terminal state.
+    # Besides COMPLETED, FAILED and CANCELLED, a job can also end as TIMEOUT,
+    # NODE_FAIL, OUT_OF_MEMORY, PREEMPTED, BOOT_FAIL or DEADLINE; all of them
+    # must stop the loop, and only COMPLETED counts as success.
+    while true; do
+        state=$(bash !{baseDir}/scripts/checkJobState.sh "${submit}")
+        echo "Lead Job state check $(date) - State: $state" >> lead_job_check.txt
+        case "$state" in
+            *COMPLETED*)
+                break
+                ;;
+            *FAILED* | *CANCELLED* | *TIMEOUT* | *NODE_FAIL* | *OUT_OF_MEMORY* | *PREEMPTED* | *BOOT_FAIL* | *DEADLINE*)
+                echo "ATAC-seq leader job ended in state: $state" >&2
+                exit 1
+                ;;
+        esac
+        sleep 15
+    done
+    '''
 }
diff --git a/workflow/scripts/checkJobState.sh b/workflow/scripts/checkJobState.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d3e8d219222acf4679cb5aaa126ace232f91a804
--- /dev/null
+++ b/workflow/scripts/checkJobState.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+#
+# Print the Slurm state of the caper lead job.
+#
+# Usage:   checkJobState.sh "<output of 'caper hpc submit'>"
+# Output:  the state sacct reports for the job (e.g. RUNNING, COMPLETED);
+#          an empty line if sacct returns no record for the job.
+# Returns: 0 on success, 1 if the argument contains no Slurm job ID.
+
+# Get the jobID of the caper lead job from the submission message passed as
+# the first argument. Prefer the number after "Submitted batch job" so extra
+# text around the message is tolerated; otherwise use the fourth word.
+if [[ "${1:-}" =~ Submitted\ batch\ job\ ([0-9]+) ]]; then
+  jobID=${BASH_REMATCH[1]}
+else
+  read -ra line <<< "${1:-}"
+  jobID=${line[3]:-}
+fi
+if [[ -z "${jobID}" ]]; then
+  echo "checkJobState.sh: no Slurm job ID found in '${1:-}'" >&2
+  exit 1
+fi
+
+# Query Slurm for the state of the caper lead job. With the header suppressed,
+# the first row is the record for the job itself (job steps are listed after).
+jobq=$(sacct --noheader --format=State -j "${jobID}")
+
+# Return the caper lead job state: first row only, with the surrounding
+# whitespace stripped by read's default field splitting.
+read -r jobstate <<< "${jobq}"
+printf '%s\n' "${jobstate}"