From 3a071c5407134f4a911fcfaf03959e4c52b1259e Mon Sep 17 00:00:00 2001 From: Brandi Cantarel <brandi.cantarel@utsouthwestern.edu> Date: Mon, 30 Oct 2017 15:16:03 -0500 Subject: [PATCH] adding variantcalling --- alignment/bamqc.sh | 10 +---- alignment/rnaseqalign.sh | 16 +++---- alignment/starfusion.sh | 6 +-- diff_exp/geneabundance.sh | 42 ++++++++++++++++++ variants/gatkrunner.sh | 61 +++++++++++++++++--------- variants/germline_vc.sh | 91 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 181 insertions(+), 45 deletions(-) create mode 100644 diff_exp/geneabundance.sh create mode 100644 variants/germline_vc.sh diff --git a/alignment/bamqc.sh b/alignment/bamqc.sh index 7c2b403..eb28f93 100644 --- a/alignment/bamqc.sh +++ b/alignment/bamqc.sh @@ -15,7 +15,7 @@ OPTIND=1 # Reset OPTIND while getopts :r:b:n:p:h opt do case $opt in - r) refgeno=$OPTARG;; + r) index_path=$OPTARG;; b) sbam=$OPTARG;; c) bed=$OPTARG;; y) nuctype=$OPTARG;; @@ -27,13 +27,7 @@ done shift $(($OPTIND -1)) # Check for mandatory options -if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then - usage -fi - -if [ $refgeno == 'GRCh38' ] || [ $refgeno == 'GRCm38' ]; then - index_path=/project/shared/bicf_workflow_ref/${refgeno} -else +if [[ -z $pair_id ]] || [[ -z $sbam ]]; then usage fi diff --git a/alignment/rnaseqalign.sh b/alignment/rnaseqalign.sh index 3209434..d994edb 100644 --- a/alignment/rnaseqalign.sh +++ b/alignment/rnaseqalign.sh @@ -15,7 +15,7 @@ OPTIND=1 # Reset OPTIND while getopts :r:a:x:y:p:h opt do case $opt in - r) refgeno=$OPTARG;; + r) index_path=$OPTARG;; x) fq1=$OPTARG;; y) fq2=$OPTARG;; a) algo=$OPTARG;; @@ -31,12 +31,6 @@ if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then usage fi -if [ $refgeno == 'GRCh38' ] || [ $refgeno == 'GRCm38' ]; then - index_path=/project/shared/bicf_workflow_ref/${refgeno} -else - usage -fi - module load samtools/gcc/1.6 picard/2.10.3 if [[ -z $SLURM_CPUS_ON_NODE ]] then @@ -47,9 +41,9 @@ then if ($fq1 != $fq2) then module load star/2.4.2a - STAR --genomeDir ${index_path}/star_index/ --readFilesIn ${fq1} ${fq2} --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out + STAR --genomeDir ${index_path}/star_index/ --readFilesIn $fq1 $fq2 --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out else - STAR --genomeDir ${index_path}/${star_index} --readFilesIn ${fq1} --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out + STAR --genomeDir ${index_path}/star_index/ --readFilesIn $fq1 --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out fi mv outLog.final.out ${pair_id}.alignerout.txt mv outAligned.sortedByCoord.out.bam output.bam @@ -57,9 +51,9 @@ else module load hisat2/2.1.0-intel if [ $fq1 == $fq2 ] then - hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --add-chrname --no-unal --dta -x ${index_path}/hisat_index/genome -U ${fq1} -S out.sam --summary-file ${pair_id}.alignerout.txt + hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --add-chrname --no-unal --dta -x ${index_path}/hisat_index/genome -U $fq1 -S out.sam --summary-file ${pair_id}.alignerout.txt else - hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --add-chrname --no-unal --dta -x ${index_path}/hisat_index/genome -1 ${fq1} -2 ${fq2} -S out.sam --summary-file ${pair_id}.alignerout.txt + hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --add-chrname --no-unal --dta -x ${index_path}/hisat_index/genome -1 $fq1 -2 $fq2 -S out.sam --summary-file ${pair_id}.alignerout.txt fi samtools view -1 --threads $SLURM_CPUS_ON_NODE -o output.bam out.sam fi diff --git a/alignment/starfusion.sh b/alignment/starfusion.sh index b2b0acd..cad5ba9 100644 --- a/alignment/starfusion.sh +++ b/alignment/starfusion.sh @@ -29,11 +29,7 @@ if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then usage fi -if [ $refgeno == 'GRCh38' ] || [ $refgeno == 'GRCm38' ]; then - index_path=/project/shared/bicf_workflow_ref/${refgeno}/CTAT_lib/ -else - usage -fi +index_path=${refgeno}/CTAT_lib/ module add python/2.7.x-anaconda star/2.5.2b STAR-Fusion --genome_lib_dir ${index_path} --left_fq ${fq1} --right_fq ${fq2} --output_dir star_fusion &> star_fusion.err diff --git a/diff_exp/geneabundance.sh b/diff_exp/geneabundance.sh new file mode 100644 index 0000000..5385804 --- /dev/null +++ b/diff_exp/geneabundance.sh @@ -0,0 +1,42 @@ +#!/bin/bash +#rnaseqalign.sh + +usage() { + echo "-h Help documentation for hisat.sh" + echo "-b --BAM File" + echo "-g --GTF File" + echo "-s --stranded" + echo "-p --Prefix for output file name" + echo "Example: bash geneabudance.sh genect_rnaseq/geneabundance.sh -s stranded -g gtf_file -p pair_id -b bam" + exit 1 +} +OPTIND=1 # Reset OPTIND +while getopts :b:g:p:s:h opt +do + case $opt in + b) sbam=$OPTARG;; + g) gtf=$OPTARG;; + p) pair_id=$OPTARG;; + s) stranded=$OPTARG;; + h) usage;; + esac +done + +shift $(($OPTIND -1)) + +# Check for mandatory options +if [[ -z $pair_id ]] || [[ -z $sbam ]] +then + usage +fi +if [[ -z $SLURM_CPUS_ON_NODE ]] +then + SLURM_CPUS_ON_NODE=1 +fi +module load subread/1.5.0-intel stringtie/1.1.2-intel +featureCounts -s $stranded -T $SLURM_CPUS_ON_NODE -p -g gene_name -a ${gtf} -o ${pair_id}.cts ${sbam} + +mkdir ${pair_id}_stringtie +cd ${pair_id}_stringtie +stringtie ../${sbam} -p $SLURM_CPUS_ON_NODE -G ../${gtf} -B -e -o denovo.gtf -A ../${pair_id}.fpkm.txt + \ No newline at end of file diff --git a/variants/gatkrunner.sh b/variants/gatkrunner.sh index ee7f07d..077ce71 100644 --- a/variants/gatkrunner.sh +++ b/variants/gatkrunner.sh @@ -2,7 +2,7 @@ #gatkrunner.sh usage() { - echo "-h Help documentation for hisat.sh" + echo "-h Help documentation for gatkrunner.sh" echo "-r --Reference Genome: GRCh38 or GRCm38" echo "-b --BAM File" echo "-p --Prefix for output file name" @@ -14,7 +14,7 @@ OPTIND=1 # Reset OPTIND while getopts :r:a:b:p:h opt do case $opt in - r) refgeno=$OPTARG;; + r) index_path=$OPTARG;; b) sbam=$OPTARG;; p) pair_id=$OPTARG;; a) algo=$OPTARG;; @@ -25,13 +25,8 @@ done shift $(($OPTIND -1)) # Check for mandatory options -if [[ -z $pair_id ]] || [[ -z $bam ] || [[ -z $refgeno ]]]; then - usage -fi - -if [ $refgeno == 'GRCh38' ] || [ $refgeno == 'GRCm38' ] || [ $refgeno == 'GRCh37' ]; then - index_path=/project/shared/bicf_workflow_ref/${refgeno} -else +if [[ -z $pair_id ]] || [[ -z $sbam ]] || [[ -z $index_path ]] +then usage fi @@ -39,23 +34,47 @@ if [[ -z $SLURM_CPUS_ON_NODE ]] then SLURM_CPUS_ON_NODE=1 fi - -dbsnp="${index_path}/Snp.vcf.gz" -knownindel="${index_path}/Snp.vcf.gz" -reffa="${index_path}/genome.fa" +if [[ -a "${index_path}/dbSnp.vcf.gz" ]] +then + dbsnp="${index_path}/dbSnp.vcf.gz" +else + echo "Missing dbSNP File: ${index_path}/dbSnp.vcf.gz" + usage +fi +if [[ -a "${index_path}/GoldIndels.vcf.gz" ]] +then + knownindel="${index_path}/GoldIndels.vcf.gz" +else + echo "Missing InDel File: ${index_path}/GoldIndels.vcf.gz" + usage +fi +if [[ -a "${index_path}/genome.fa" ]] +then + reffa="${index_path}/genome.fa" +else + echo "Missing Fasta File: ${index_path}/genome.fa" + usage +fi module load gatk/3.7 samtools/1.6 +samtools index -@ $SLURM_CPUS_ON_NODE ${sbam} -if [[ $algo == 'gatkbam' ]] +if [[ $algo == 'gatkbam_rna' ]] +then + module load picard/2.10.3 + java -Xmx4g -jar $PICARD/picard.jar CleanSam INPUT=${sbam} OUTPUT=${pair_id}.clean.bam + #java -Xmx4g -jar $PICARD/picard.jar AddOrReplaceReadGroups INPUT=${pair_id}.clean.bam O=${pair_id}.rg_added_sorted.bam SO=coordinate RGID=${pair_id} RGLB=tx RGPL=illumina RGPU=barcode RGSM=${pair_id} + samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.clean.bam + java -Xmx4g -jar $GATK_JAR -T SplitNCigarReads -R ${reffa} -I ${pair_id}.clean.bam -o ${pair_id}.split.bam -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS + java -Xmx32g -jar $GATK_JAR -T RealignerTargetCreator -known ${knownindel} -R ${reffa} -o ${pair_id}.bam.list -I ${pair_id}.split.bam -nt $SLURM_CPUS_ON_NODE -nct 1 + java -Xmx32g -jar $GATK_JAR -I ${pair_id}.split.bam -R ${reffa} --filter_mismatching_base_and_quals -T IndelRealigner -targetIntervals ${pair_id}.bam.list -o ${pair_id}.realigned.bam + java -Xmx32g -jar $GATK_JAR -l INFO -R ${reffa} --knownSites ${dbsnp} -I ${pair_id}.realigned.bam -T BaseRecalibrator -cov ReadGroupCovariate -cov QualityScoreCovariate -cov CycleCovariate -cov ContextCovariate -o ${pair_id}.recal_data.grp -nt 1 -nct $SLURM_CPUS_ON_NODE + java -Xmx32g -jar $GATK_JAR -T PrintReads -R ${reffa} -I ${pair_id}.realigned.bam -BQSR ${pair_id}.recal_data.grp -o ${pair_id}.final.bam -nt 1 -nct $SLURM_CPUS_ON_NODE +elif [[ $algo == 'gatkbam' ]] then - samtools index ${sbam} java -Xmx32g -jar $GATK_JAR -T RealignerTargetCreator -known ${knownindel} -R ${reffa} -o ${pair_id}.bam.list -I ${sbam} -nt $SLURM_CPUS_ON_NODE -nct 1 java -Xmx32g -jar $GATK_JAR -I ${sbam} -R ${reffa} --filter_mismatching_base_and_quals -T IndelRealigner -targetIntervals ${pair_id}.bam.list -o ${pair_id}.realigned.bam java -Xmx32g -jar $GATK_JAR -l INFO -R ${reffa} --knownSites ${dbsnp} -I ${pair_id}.realigned.bam -T BaseRecalibrator -cov ReadGroupCovariate -cov QualityScoreCovariate -cov CycleCovariate -cov ContextCovariate -o ${pair_id}.recal_data.grp -nt 1 -nct $SLURM_CPUS_ON_NODE - java -Xmx32g -jar $GATK_JAR -T PrintReads -R ${reffa} -I ${pair_id}.realigned.bam -BQSR ${pair_id}.recal_data.grp -o ${pair_id}.final.bam -nt 1 -nct 8 -elif -then - -else - + java -Xmx32g -jar $GATK_JAR -T PrintReads -R ${reffa} -I ${pair_id}.realigned.bam -BQSR ${pair_id}.recal_data.grp -o ${pair_id}.final.bam -nt 1 -nct $SLURM_CPUS_ON_NODE fi + diff --git a/variants/germline_vc.sh b/variants/germline_vc.sh new file mode 100644 index 0000000..43daaea --- /dev/null +++ b/variants/germline_vc.sh @@ -0,0 +1,91 @@ +usage() { + echo "-h Help documentation for gatkrunner.sh" + echo "-r --Reference Genome: GRCh38 or GRCm38" + echo "-p --Prefix for output file name" + echo "-a --Algorithm/Command" + echo "Example: bash hisat.sh -p prefix -r GRCh38" + exit 1 +} +OPTIND=1 # Reset OPTIND +while getopts :r:a:b:p:h opt +do + case $opt in + r) refgeno=$OPTARG;; + p) pair_id=$OPTARG;; + a) algo=$OPTARG;; + h) usage;; + esac +done +function join_by { local IFS="$1"; shift; echo "$*"; } +shift $(($OPTIND -1)) + +# Check for mandatory options +if [[ -z $pair_id ]] || [[ -z $bam ] || [[ -z $refgeno ]]]; then + usage +fi +if [[ -z $SLURM_CPUS_ON_NODE ]] +then + SLURM_CPUS_ON_NODE=1 +fi +if [[ -a "${index_path}/dbSnp.vcf.gz" ]] +then + dbsnp="${index_path}/dbSnp.vcf.gz" +else + echo "Missing dbSNP File: ${index_path}/dbSnp.vcf.gz" + usage +fi +if [[ -a "${index_path}/GoldIndels.vcf.gz" ]] +then + knownindel="${index_path}/GoldIndels.vcf.gz" +else + echo "Missing InDel File: ${index_path}/GoldIndels.vcf.gz" + usage +fi +if [[ -a "${index_path}/genome.fa" ]] +then + reffa="${index_path}/genome.fa" +else + echo "Missing Fasta File: ${index_path}/genome.fa" + usage +fi +module load python/2.7.x-anaconda samtools/1.6 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14 + +for i in *.bam; do + samtools index -@ $SLURM_CPUS_ON_NODE $i +done + +if [[ $algo == 'mpileup' ]] +then + samtools mpileup -t 'AD,DP,INFO/AD' -ug -Q20 -C50 -f ${reffa} *.bam | bcftools call -vmO z -o ${pair_id}.sam.ori.vcf.gz + vcf-sort ${pair_id}.sam.ori.vcf.gz | vcf-annotate -n --fill-type | bcftools norm -c s -f ${reffa} -w 10 -O z -o ${pair_id}.sam.vcf.gz - +elif [[ $algo == 'hotspot' ]] +then + samtools mpileup -d 99999 -t 'AD,DP,INFO/AD' -uf ${reffa} *.bam > ${pair_id}.mpi + bcftools filter -i "AD[1]/DP > 0.01" ${pair_id}.mpi | bcftools filter -i "DP > 50" | bcftools call -m -A |vcf-annotate -n --fill-type | bcftools norm -c s -f ${reffa} -w 10 -O z -o ${pair_id}.lowfreq.vcf.gz - + java -jar $SNPEFF_HOME/SnpSift.jar annotate ${index_path}/cosmic.vcf.gz ${pair_id}.lowfreq.vcf.gz | java -jar $SNPEFF_HOME/SnpSift.jar filter "(CNT[*] >0)" - |bgzip > ${pair_id}.hotspot.vcf.gz +elif +then [[ $algo == 'speedseq' ]] + module load speedseq/20160506 + speedseq var -t $SLURM_CPUS_ON_NODE -o ssvar ${reffa} *.bam + vcf-annotate -n --fill-type ssvar.vcf.gz| bcftools norm -c s -f ${reffa} -w 10 -O z -o ${pair_id}.ssvar.vcf.gz - +elif [[ $algo == 'gatk' ]] +then + module load gatk/3.7 + $gvcflist = '' + for i in *.bam; do + java -Djava.io.tmpdir=./ -Xmx32g -jar $GATK_JAR -R ${reffa} -D ${dbsnp} -T HaplotypeCaller -stand_call_conf 30 -stand_emit_conf 10.0 -A FisherStrand -A QualByDepth -A VariantType -A DepthPerAlleleBySample -A HaplotypeScore -A AlleleBalance -variant_index_type LINEAR -variant_index_parameter 128000 --emitRefConfidence GVCF -I $i -o ${i}.gatk.g.vcf -nct 2 & + $gvcflist += "--variant ${i}.gatk.g.vcf " + done + wait + java -Djava.io.tmpdir=./ -Xmx32g -jar $GATK_JAR -R ${reffa} -D ${dbsnp} -T GenotypeGVCFs -o gatk.vcf -nt $SLURM_CPUS_ON_NODE $gvcflist + vcf-annotate -n --fill-type gatk.vcf | bcftools norm -c s -f ${reffa} -w 10 -O z -o ${pair_id}.gatk.vcf.gz - + tabix ${pair_id}.gatk.vcf.gz +elif +then [[ $algo == 'platypus' ]] + module load platypus/gcc/0.8.1 + $bamlist = join_by , *.bam + Platypus.py callVariants --minMapQual=10 --mergeClusteredVariants=1 --nCPU=$SLURM_CPUS_ON_NODE --bamFiles=${bamlist} --refFile=${reffa} --output=platypus.vcf + vcf-sort platypus.vcf |vcf-annotate -n --fill-type -n |bgzip > platypus.vcf.gz + tabix platypus.vcf.gz + bcftools norm -c s -f ${reffa} -w 10 -O z -o ${i}.pl.vcf.gz platypus.vcf.gz +fi -- GitLab