diff --git a/alignment/bamqc.sh b/alignment/bamqc.sh index 903127249645ae983cc454168f014f03cecf474f..7c2b403bcc6b6c9105637e372165d75a72959e65 100644 --- a/alignment/bamqc.sh +++ b/alignment/bamqc.sh @@ -1,9 +1,59 @@ #!/bin/bash #trimgalore.sh -pair_id=$1 -bam=$2 +usage() { + echo "-h Help documentation for hisat.sh" + echo "-r --Reference Genome: GRCh38 or GRCm38" + echo "-b --BAM File" + echo "-n --NucType" + echo "-p --Prefix for output file name" + echo "-c --Capture Bedfile" + echo "Example: bash hisat.sh -p prefix -r GRCh38 -a hisat -x SRR1551047_1.fastq.gz -y SRR1551047_2.fastq.gz" + exit 1 +} +OPTIND=1 # Reset OPTIND +while getopts :r:b:n:p:h opt +do + case $opt in + r) refgeno=$OPTARG;; + b) sbam=$OPTARG;; + c) bed=$OPTARG;; + y) nuctype=$OPTARG;; + p) pair_id=$OPTARG;; + h) usage;; + esac +done -module load samtools/1.4.1 fastqc/0.11.5 -samtools flagstat ${bam} > ${pair_id}.flagstat.txt -fastqc -f bam ${bam} +shift $(($OPTIND -1)) + +# Check for mandatory options +if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then + usage +fi + +if [ $refgeno == 'GRCh38' ] || [ $refgeno == 'GRCm38' ]; then + index_path=/project/shared/bicf_workflow_ref/${refgeno} +else + usage +fi + +module load samtools/1.6 fastqc/0.11.5 +samtools flagstat ${sbam} > ${pair_id}.flagstat.txt +fastqc -f bam ${sbam} + +if [[ $nuctype == 'dna' ]]; then + module load bedtools/2.26.0 picard/2.10.3 + samtools view -b --threads $SLURM_CPUS_ON_NODE -L ${bed} -o ${pair_id}.ontarget.bam ${sbam} + samtools index ${pair_id}.ontarget.bam + samtools flagstat ${pair_id}.ontarget.bam > ${pair_id}.ontarget.flagstat.txt + samtools view -b -q 1 ${pair_id}.ontarget.bam | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt + samtools view -b -F 1024 ${pair_id}.ontarget.bam | bedtools coverage -sorted -g ${index_path}/genomefile.txt -a ${bed} -b stdin -hist | grep ^all > ${pair_id}.dedupcov.txt + java -Xmx32g -jar \$PICARD/picard.jar CollectAlignmentSummaryMetrics R=${reffa} I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt + java -Xmx64g -jar \$PICARD/picard.jar EstimateLibraryComplexity I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.libcomplex.txt + samtools view -F 1024 ${pair_id}.ontarget.bam | awk '{sum+=\$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt + java -Xmx4g -jar \$PICARD/picard.jar CollectInsertSizeMetrics INPUT=${pair_id}.bam HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${reffa} OUTPUT=${pair_id}.hist.txt + samtools view -b -q 1 ${pair_id}.ontarget.bam | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt + bedtools coverage -sorted -g ${index_path}/genomefile.txt -a ${bed} -b ${sbam} -hist > ${pair_id}.covhist.txt + perl $baseDir/scripts/calculate_depthcov.pl ${pair_id}.covhist.txt + grep ^all ${pair_id}.covhist.txt > ${pair_id}.genomecov.txt + fi diff --git a/alignment/parse_seqqc.pl b/alignment/bamqc_dna.pl similarity index 100% rename from alignment/parse_seqqc.pl rename to alignment/bamqc_dna.pl diff --git a/alignment/parse_flagstat.pl b/alignment/bamqc_rna.pl similarity index 100% rename from alignment/parse_flagstat.pl rename to alignment/bamqc_rna.pl diff --git a/alignment/dnaseqalign.sh b/alignment/dnaseqalign.sh new file mode 100644 index 0000000000000000000000000000000000000000..98972fef62e8671e10260d7c155e0cef82171338 --- /dev/null +++ b/alignment/dnaseqalign.sh @@ -0,0 +1,65 @@ +#!/bin/bash +#dnaseqalign.sh + +usage() { + echo "-h Help documentation for hisat.sh" + echo "-r --Reference Genome: GRCh38 or GRCm38" + echo "-x --FastQ R1" + echo "-y --FastQ R2" + echo "-p --Prefix for output file name" + echo "Example: bash hisat.sh -p prefix -r GRCh38 -x SRR1551047_1.fastq.gz -y SRR1551047_2.fastq.gz" + exit 1 +} +OPTIND=1 # Reset OPTIND +while getopts :r:a:x:y:p:h opt +do + case $opt in + r) refgeno=$OPTARG;; + x) fq1=$OPTARG;; + y) fq2=$OPTARG;; + a) algo=$OPTARG;; + p) pair_id=$OPTARG;; + h) usage;; + esac +done + +shift $(($OPTIND -1)) + +# Check for mandatory options +if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then + usage +fi + +if [ $refgeno == 'GRCh38' ] || [ $refgeno == 'GRCm38' ] || [ $refgeno == 'GRCh37' ]; then + index_path=/project/shared/bicf_workflow_ref/${refgeno} +else + usage +fi + +if [[ -z $SLURM_CPUS_ON_NODE ]] +then + SLURM_CPUS_ON_NODE=1 +fi + +module load bwakit/0.7.15 bwa/intel/0.7.15 samtools/1.6 bcftools/1.6 htslib/1.6 picard/2.10.3 speedseq/20160506 +if ($fq1 != $fq2) +then + bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} > out.sam +else + bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} ${fq2} > out.sam +fi +if [ $refgeno == 'GRCh38' ] +then + cat out.sam | k8 /cm/shared/apps/bwakit/0.7.15/bwa-postalt.js -p ${pair_id}.hla ${index_path}/genome.fa.alt | samtools view -1 - > output.unsort.bam + run-HLA ${pair_id}.hla > ${pair_id}.hla.top 2> ${pair_id}.log.hla + touch ${pair_id}.hla.HLA-dummy.gt + cat ${pair_id}.hla.HLA*.gt | grep ^GT | cut -f2- > ${pair_id}.hla.all +else + samtools view -1 -o output.unsort.bam out.sam +fi +samtools sort --threads $SLURM_CPUS_ON_NODE -o output.dups.bam output.unsort.bam +samtools view -h output.unsort.bam | samblaster -M -a --excludeDups --addMateTags --maxSplitCount 2 --minNonOverlap 20 -d discordants.sam -s splitters.sam > temp.sam +gawk '{ if (\$0~"^@") { print; next } else { \$10="*"; \$11="*"; print } }' OFS="\\t" splitters.sam | samtools view -S -b - | samtools sort -o ${pair_id}.splitters.bam - +gawk '{ if (\$0~"^@") { print; next } else { \$10="*"; \$11="*"; print } }' OFS="\\t" discordants.sam | samtools view -S -b - | samtools sort -o ${pair_id}.discordants.bam - +java -Djava.io.tmpdir=./ -Xmx4g -jar \$PICARD/picard.jar FixMateInformation ASSUME_SORTED=TRUE SORT_ORDER=coordinate ADD_MATE_CIGAR=TRUE I=output.dups.bam O=${pair_id}.bam +samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.bam diff --git a/alignment/hisat.sh b/alignment/hisat.sh deleted file mode 100644 index 87af91b85d018f8a85baefe62c2c3f89198a1ea3..0000000000000000000000000000000000000000 --- a/alignment/hisat.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash -#hisat.sh - -usage() { - echo "-h Help documentation for hisat.sh" - echo "-r --Reference Genome: GRCh38 or GRCm38" - echo "-a --FastQ R1" - echo "-b --FastQ R2" - echo "-p --Prefix for output file name" - echo "Example: bash hisat.sh -p prefix -r GRCh38 -a SRR1551047_1.fastq.gz -b SRR1551047_2.fastq.gz" - exit 1 -} -OPTIND=1 # Reset OPTIND -while getopts :r:a:b:p:h opt -do - case $opt in - r) refgeno=$OPTARG;; - a) fq1=$OPTARG;; - b) fq2=$OPTARG;; - p) pair_id=$OPTARG;; - h) usage;; - esac -done - -shift $(($OPTIND -1)) - -# Check for mandatory options -if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then - usage -fi - -if [ $refgeno == 'GRCh38' ] || [ $refgeno == 'GRCm38' ]; then - index_path=/project/shared/bicf_workflow_ref/${refgeno}/hisat_index/ -else - usage -fi - -if [[ -z $SLURM_CPUS_ON_NODE ]] -then - SLURM_CPUS_ON_NODE=1 -fi -module load hisat2/2.1.0-intel samtools/gcc/1.6 picard/2.10.3 -if [ $fq1 == $fq2 ] -then - hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --add-chrname --no-unal --dta -x ${index_path}/genome -U ${fq1} -S out.sam --summary-file ${pair_id}.alignerout.txt -else - hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --add-chrname --no-unal --dta -x ${index_path}/genome -1 ${fq1} -2 ${fq2} -S out.sam --summary-file ${pair_id}.alignerout.txt -fi -samtools view -1 --threads $SLURM_CPUS_ON_NODE -o output.bam out.sam -samtools sort -@ $SLURM_CPUS_ON_NODE -O BAM -n -o output.nsort.bam output.bam -java -jar $PICARD/picard.jar FixMateInformation ASSUME_SORTED=TRUE SORT_ORDER=coordinate ADD_MATE_CIGAR=TRUE I=output.nsort.bam O=${pair_id}.bam -samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.bam diff --git a/alignment/rnaseqalign.sh b/alignment/rnaseqalign.sh new file mode 100644 index 0000000000000000000000000000000000000000..320943492d353cc43025fff346980d55cacf9e34 --- /dev/null +++ b/alignment/rnaseqalign.sh @@ -0,0 +1,68 @@ +#!/bin/bash +#rnaseqalign.sh + +usage() { + echo "-h Help documentation for hisat.sh" + echo "-r --Reference Genome: GRCh38 or GRCm38" + echo "-x --FastQ R1" + echo "-y --FastQ R2" + echo "-a --Method: hisat or star" + echo "-p --Prefix for output file name" + echo "Example: bash hisat.sh -p prefix -r GRCh38 -a hisat -x SRR1551047_1.fastq.gz -y SRR1551047_2.fastq.gz" + exit 1 +} +OPTIND=1 # Reset OPTIND +while getopts :r:a:x:y:p:h opt +do + case $opt in + r) refgeno=$OPTARG;; + x) fq1=$OPTARG;; + y) fq2=$OPTARG;; + a) algo=$OPTARG;; + p) pair_id=$OPTARG;; + h) usage;; + esac +done + +shift $(($OPTIND -1)) + +# Check for mandatory options +if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then + usage +fi + +if [ $refgeno == 'GRCh38' ] || [ $refgeno == 'GRCm38' ]; then + index_path=/project/shared/bicf_workflow_ref/${refgeno} +else + usage +fi + +module load samtools/gcc/1.6 picard/2.10.3 +if [[ -z $SLURM_CPUS_ON_NODE ]] +then + SLURM_CPUS_ON_NODE=1 +fi +if [ $algo == 'star' ] +then + if ($fq1 != $fq2) + then + module load star/2.4.2a + STAR --genomeDir ${index_path}/star_index/ --readFilesIn ${fq1} ${fq2} --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out + else + STAR --genomeDir ${index_path}/${star_index} --readFilesIn ${fq1} --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out + fi + mv outLog.final.out ${pair_id}.alignerout.txt + mv outAligned.sortedByCoord.out.bam output.bam +else + module load hisat2/2.1.0-intel + if [ $fq1 == $fq2 ] + then + hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --add-chrname --no-unal --dta -x ${index_path}/hisat_index/genome -U ${fq1} -S out.sam --summary-file ${pair_id}.alignerout.txt + else + hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --add-chrname --no-unal --dta -x ${index_path}/hisat_index/genome -1 ${fq1} -2 ${fq2} -S out.sam --summary-file ${pair_id}.alignerout.txt + fi + samtools view -1 --threads $SLURM_CPUS_ON_NODE -o output.bam out.sam +fi +samtools sort -@ $SLURM_CPUS_ON_NODE -O BAM -n -o output.nsort.bam output.bam +java -jar $PICARD/picard.jar FixMateInformation ASSUME_SORTED=TRUE SORT_ORDER=coordinate ADD_MATE_CIGAR=TRUE I=output.nsort.bam O=${pair_id}.bam +samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.bam diff --git a/alignment/star_aligner.sh b/alignment/star_aligner.sh deleted file mode 100644 index f85f9da16723eb84f78f99ea029a5bc8ae2c0cc1..0000000000000000000000000000000000000000 --- a/alignment/star_aligner.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -#hisat.sh - -pair_id=$1 -index_path=$2 -fq1=$3 -fq2=$4 - -module load star/2.4.2a samtools/1.4.1 - -if ($fq1 != $fq2) -then - STAR --genomeDir ${index_path}/star_index/ --readFilesIn ${fq1} ${fq2} --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out -else - STAR --genomeDir ${index_path}/${star_index} --readFilesIn ${fq1} --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out -fi -mv outLog.final.out ${pair_id}.alignerout.txt -samtools sort --threads \$SLURM_CPUS_ON_NODE -o ${pair_id}.bam outAligned.sortedByCoord.out.bam diff --git a/cnvkit.sh b/variants/cnvkit.sh similarity index 100% rename from cnvkit.sh rename to variants/cnvkit.sh diff --git a/variants/gatkrunner.sh b/variants/gatkrunner.sh new file mode 100644 index 0000000000000000000000000000000000000000..ee7f07dee43770f0bcc5e78f4a1ba48aea17ecec --- /dev/null +++ b/variants/gatkrunner.sh @@ -0,0 +1,61 @@ +#!/bin/bash +#gatkrunner.sh + +usage() { + echo "-h Help documentation for hisat.sh" + echo "-r --Reference Genome: GRCh38 or GRCm38" + echo "-b --BAM File" + echo "-p --Prefix for output file name" + echo "-a --Algorithm/Command" + echo "Example: bash hisat.sh -p prefix -r GRCh38 -b File.bam" + exit 1 +} +OPTIND=1 # Reset OPTIND +while getopts :r:a:b:p:h opt +do + case $opt in + r) refgeno=$OPTARG;; + b) sbam=$OPTARG;; + p) pair_id=$OPTARG;; + a) algo=$OPTARG;; + h) usage;; + esac +done + +shift $(($OPTIND -1)) + +# Check for mandatory options +if [[ -z $pair_id ]] || [[ -z $bam ] || [[ -z $refgeno ]]]; then + usage +fi + +if [ $refgeno == 'GRCh38' ] || [ $refgeno == 'GRCm38' ] || [ $refgeno == 'GRCh37' ]; then + index_path=/project/shared/bicf_workflow_ref/${refgeno} +else + usage +fi + +if [[ -z $SLURM_CPUS_ON_NODE ]] +then + SLURM_CPUS_ON_NODE=1 +fi + +dbsnp="${index_path}/Snp.vcf.gz" +knownindel="${index_path}/Snp.vcf.gz" +reffa="${index_path}/genome.fa" + +module load gatk/3.7 samtools/1.6 + +if [[ $algo == 'gatkbam' ]] +then + samtools index ${sbam} + java -Xmx32g -jar $GATK_JAR -T RealignerTargetCreator -known ${knownindel} -R ${reffa} -o ${pair_id}.bam.list -I ${sbam} -nt $SLURM_CPUS_ON_NODE -nct 1 + java -Xmx32g -jar $GATK_JAR -I ${sbam} -R ${reffa} --filter_mismatching_base_and_quals -T IndelRealigner -targetIntervals ${pair_id}.bam.list -o ${pair_id}.realigned.bam + java -Xmx32g -jar $GATK_JAR -l INFO -R ${reffa} --knownSites ${dbsnp} -I ${pair_id}.realigned.bam -T BaseRecalibrator -cov ReadGroupCovariate -cov QualityScoreCovariate -cov CycleCovariate -cov ContextCovariate -o ${pair_id}.recal_data.grp -nt 1 -nct $SLURM_CPUS_ON_NODE + java -Xmx32g -jar $GATK_JAR -T PrintReads -R ${reffa} -I ${pair_id}.realigned.bam -BQSR ${pair_id}.recal_data.grp -o ${pair_id}.final.bam -nt 1 -nct 8 +elif +then + +else + +fi diff --git a/vcf/lca.R b/variants/lca.R similarity index 100% rename from vcf/lca.R rename to variants/lca.R diff --git a/vcf/parse_svresults.pl b/variants/parse_svresults.pl similarity index 100% rename from vcf/parse_svresults.pl rename to variants/parse_svresults.pl diff --git a/vcf/transvar2bed.pl b/variants/transvar2bed.pl similarity index 100% rename from vcf/transvar2bed.pl rename to variants/transvar2bed.pl diff --git a/vcf/uniform_integrated_vcf.pl b/variants/uniform_integrated_vcf.pl similarity index 100% rename from vcf/uniform_integrated_vcf.pl rename to variants/uniform_integrated_vcf.pl