diff --git a/astrocyte_pkg.yml b/astrocyte_pkg.yml index cf9d8a32de8c8824f89f4575c5d1a3818d1f0092..0f884de970f735e91f10cacfbb6777cb2b68495b 100644 --- a/astrocyte_pkg.yml +++ b/astrocyte_pkg.yml @@ -13,12 +13,17 @@ author: 'Felix Perez, Achisha Saikia, Peng Lian' # A contact email address for questions email: 'felix.perez@utsouthwestern.edu, achisha.saikia@utsouthwestern.edu, biohpc-help@utsouthwestern.edu' # A more informative title for the workflow package -title: 'ATAC-seq Source Workflow" - -This pipeline is designed for automated end-to-end quality control and processing of ATAC-seq. The pipeline can be run end-to-end, starting from raw FASTQ files all the way to peak calling and signal track generation using a single caper submit command. One can also start the pipeline from intermediate stages (for example, using alignment files as input). The pipeline supports both single-end and paired-end data as well as replicated or non-replicated datasets. The outputs produced by the pipeline include 1) formatted HTML reports that include quality control measures specifically designed for ATAC-seq and DNase-seq data, 2) analysis of reproducibility, 3) stringent and relaxed thresholding of peaks, 4) fold-enrichment and pvalue signal tracks. +title: 'ATAC-seq Source Workflow' description: | - # TODO: Please describe the workflow. (AS) + This pipeline is designed for automated end-to-end quality control and processing of ATAC-seq. + The pipeline can be run end-to-end, starting from raw FASTQ files all the way to peak calling + and signal track generation using a single caper submit command. One can also start the pipeline from intermediate + stages (for example, using alignment files as input). The pipeline supports both single-end and paired-end data as + well as replicated or non-replicated datasets. 
The outputs produced by the pipeline include 1) formatted HTML reports + that include quality control measures specifically designed for ATAC-seq and DNase-seq data, 2) analysis of + reproducibility, 3) stringent and relaxed thresholding of peaks, 4) fold-enrichment and p-value signal tracks. + #### New Features in Astrocyte 0.4.0 and above #### citation: | diff --git a/workflow/configs/biohpc.config b/workflow/configs/biohpc.config index 465d7e7bd09bbd7b6bcf8541c74fc87b271d0b73..9ff1e4bf38d24e68075bd6dbfd7719d75ca63741 100755 --- a/workflow/configs/biohpc.config +++ b/workflow/configs/biohpc.config @@ -1,6 +1,12 @@ singularity { enabled = true - runOptions = ' -B /cm/shared/apps/slurm/16.05.8 -B /etc/slurm -B /cm/shared/apps/slurm/var/etc/ -B /usr/lib64/libreadline.so.6 -B /usr/lib64/libhistory.so.6 -B /usr/lib64/libtinfo.so.5 -B /var/run/munge -B /usr/lib64/libmunge.so.2 -B /usr/lib64/libmunge.so.2.0.0 -B /cm/shared/apps/slurm/16.05.8/lib64/slurm/ -B /cm/shared/apps/slurm/16.05.8/lib64' + runOptions = '' + // Below connects the experimental atac container to BioHPC's Slurm job scheduler. + // runOptions = '\ + // --bind /cm/shared/apps/slurm/16.05.8,/etc/slurm,/cm/shared/apps/slurm/var/etc/,/usr/lib64/libreadline.so.6 \ + // --bind /usr/lib64/libhistory.so.6,/usr/lib64/libtinfo.so.5,/var/run/munge,/usr/lib64/libmunge.so.2 \ + // --bind /usr/lib64/libmunge.so.2.0.0,/cm/shared/apps/slurm/16.05.8/lib64/slurm/' + // Please do NOT use "--disable-cache" in this runOptions. // Starting from version 2.0.0, the astrocyte_cli will clean up the cache automatically. // runOptions = '--bind /vagrant:/vagrant' // Use this one for vagrant development env only @@ -16,7 +22,8 @@ process { beforeScript = 'ulimit -Ss unlimited' withName:runSource { - container = 'docker://git.biohpc.swmed.edu:5050/s219741/astrocyte-atac-source/atac:0.0.1' + // Experimental containerized version of the caper software. 
+        // container = 'docker://git.biohpc.swmed.edu:5050/s219741/astrocyte-atac-source/atac:0.0.1'
         executor = 'local'
     }
 }
diff --git a/workflow/main.nf b/workflow/main.nf
index adc3f4bb718cd94cdd965e1f80caf985f165f3b2..7ebcf8035ecfd95091552ab9928c03fae85d27dc 100644
--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -1,8 +1,11 @@
 /*
  * Copyright (c) 2024. The University of Texas Southwestern Medical Center
  *
- * TODO: (AC) Brief description of ATAC-seq (DONE)
- * ATAC-seq is a molecular biology technique that assesses chromatin accessibility in a genome. It uses a hyperactive Tn5 transposase to insert sequencing adapters into open chromatin regions, allowing researchers to identify and sequence these accessible genomic regions. ATAC-seq is widely used to study gene regulation, identify enhancers and promoters, and gain insights into chromatin structure.
+ * ATAC-seq is a molecular biology technique that assesses chromatin accessibility in a genome.
+ * It uses a hyperactive Tn5 transposase to insert sequencing adapters into open chromatin regions,
+ * allowing researchers to identify and sequence these accessible genomic regions. ATAC-seq is widely
+ * used to study gene regulation, identify enhancers and promoters, and gain insights into chromatin structure.
+ *
  * @authors
  * Felix Perez, Achisha Saikia
  *
@@ -23,10 +26,50 @@ process runSource {
   output:
     file '*'
 
-  """
-  export LD_LIBRARY_PATH=/usr/lib64/:$LD_LIBRARY_PATH
+  shell:
+  '''
+  # Load the modules before touching conda: the anaconda module presumably provides
+  # the conda executable (confirm on BioHPC), so the hook must be evaluated after it.
+  module load python/3.8.x-anaconda
+  module load openjdk/18
+
+  # Enable the use of bash-specific conda commands in this shell.
+  eval "$(conda shell.bash hook)"
+
+  # Create a temporary conda environment for caper.
+  conda create -y -c bioconda -c defaults -c conda-forge --name astrocyte-atac-caper python=3.8.18
+  # Activate and install caper.
+  conda activate astrocyte-atac-caper
+  export PATH=$PATH:~/.local/bin
+  pip install caper
+  # Creates a directory in the user's home2: ~/.caper/
+  caper init slurm
+
+  # Record the relevant software versions.
+  java -version 2> java_version.txt
   sinfo -V > slurm_version.txt
   caper --version > caper_version.txt
-  caper hpc submit $baseDir/external_repo/astrocyte-atac-runner/atac.wdl -i $inputJson --singularity --leader-job-name atac-source 1> batch_job.txt 2>> caper_err.txt
-  """
+
+  # Launch the ATAC-seq leader job.
+  jobsubmit=$(caper hpc submit !{baseDir}/external_repo/astrocyte-atac-runner/atac.wdl -i !{inputJson} --singularity --leader-job-name atac-source)
+
+  # Poll the leader job every 15 s until Slurm reports a terminal state. Every terminal
+  # state is listed: a job that ends in e.g. TIMEOUT would otherwise be polled forever.
+  while true; do
+    # "|| true": a non-zero exit from the helper must not abort the task before cleanup.
+    state=$(bash !{baseDir}/scripts/checkJobState.sh "${jobsubmit}") || true
+    echo "Lead Job state check $(date) - State: $state" >> lead_job_check.txt
+    case "$state" in
+      *COMPLETED*|*FAILED*|*CANCELLED*|*TIMEOUT*|*NODE_FAIL*) break ;;
+      *OUT_OF_MEMORY*|*PREEMPTED*|*BOOT_FAIL*|*DEADLINE*|*NO_JOB_ID*) break ;;
+    esac
+    sleep 15
+  done
+
+  # Deactivate the temporary caper conda environment and delete it.
+  # -y: answer the removal prompt, since nobody is attached to confirm it.
+  conda deactivate
+  conda remove -y --name astrocyte-atac-caper --all
+  rm -rf ~/.caper/
+  '''
 }
diff --git a/workflow/scripts/checkJobState.sh b/workflow/scripts/checkJobState.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d3e8d219222acf4679cb5aaa126ace232f91a804
--- /dev/null
+++ b/workflow/scripts/checkJobState.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+#
+# Print the Slurm state of the caper lead job.
+#
+# Usage: checkJobState.sh "$jobsubmit"
+#   $1  Submission message; the job ID is read from its fourth whitespace-separated
+#       field (presumably "Submitted batch job N" -- confirm against caper's output).
+#
+# Output: the state sacct reports for the job allocation, or an empty line when sacct has
+# no record of the job yet. Prints NO_JOB_ID and exits 1 when $1 holds no numeric job ID.
+
+# Get the jobID of the caper lead job from the submission message.
+read -ra line <<< "$1"
+jobID=${line[3]}
+
+# Without a numeric ID there is nothing to query; say so instead of returning an
+# empty state that the caller would poll on indefinitely.
+if [[ ! "$jobID" =~ ^[0-9]+$ ]]; then
+    echo "NO_JOB_ID"
+    exit 1
+fi
+
+# Query Slurm for the state of the caper lead job: allocation only (-X), no header (-n),
+# untruncated fields (-P) so long state names are not cut off.
+jobq=$(sacct -X -n -P --format=State -j "$jobID")
+
+# Return the caper lead job state (first line, surrounding whitespace trimmed).
+read -r jobstate <<< "$jobq"
+echo "$jobstate"