adding variantcalling

3a071c54 · Brandi Cantarel · f649f687 · 3a071c54 · 3a071c54 · 3a071c54
Commit 3a071c54 authored 7 years ago by Brandi Cantarel
--- a/alignment/bamqc.sh
+++ b/alignment/bamqc.sh
@@ -15,7 +15,7 @@ OPTIND=1 # Reset OPTIND
 while getopts :r:b:n:p:h opt
 do
    case $opt in
-        r) refgeno=$OPTARG;;
+        r) index_path=$OPTARG;;
        b) sbam=$OPTARG;;
        c) bed=$OPTARG;;
        y) nuctype=$OPTARG;;
@@ -27,13 +27,7 @@ done
 shift $(($OPTIND -1))

 # Check for mandatory options
-if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then
-    usage
-fi
-
-if [ $refgeno == 'GRCh38' ] || [ $refgeno == 'GRCm38' ]; then
-    index_path=/project/shared/bicf_workflow_ref/${refgeno}
-else
+if [[ -z $pair_id ]] || [[ -z $sbam ]]; then
    usage
 fi


--- a/alignment/rnaseqalign.sh
+++ b/alignment/rnaseqalign.sh
@@ -15,7 +15,7 @@ OPTIND=1 # Reset OPTIND
 while getopts :r:a:x:y:p:h opt
 do
    case $opt in
-        r) refgeno=$OPTARG;;
+        r) index_path=$OPTARG;;
        x) fq1=$OPTARG;;
        y) fq2=$OPTARG;;
 	a) algo=$OPTARG;;
@@ -31,12 +31,6 @@ if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then
    usage
 fi

-if [ $refgeno == 'GRCh38' ] || [ $refgeno == 'GRCm38' ]; then
-    index_path=/project/shared/bicf_workflow_ref/${refgeno}
-else
-    usage
-fi
-
 module load  samtools/gcc/1.6 picard/2.10.3
 if [[ -z $SLURM_CPUS_ON_NODE ]]
 then
@@ -47,9 +41,9 @@ then
    if ($fq1 != $fq2)
    then
 	module load star/2.4.2a
-	STAR --genomeDir ${index_path}/star_index/ --readFilesIn ${fq1} ${fq2} --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out
+	STAR --genomeDir ${index_path}/star_index/ --readFilesIn $fq1 $fq2 --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out
    else
-	STAR --genomeDir ${index_path}/${star_index} --readFilesIn ${fq1} --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out
+	STAR --genomeDir ${index_path}/star_index/ --readFilesIn $fq1 --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out
    fi
    mv outLog.final.out ${pair_id}.alignerout.txt
    mv outAligned.sortedByCoord.out.bam output.bam
@@ -57,9 +51,9 @@ else
    module load hisat2/2.1.0-intel
    if [ $fq1 == $fq2 ]
    then
-	hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --add-chrname --no-unal --dta -x ${index_path}/hisat_index/genome -U ${fq1} -S out.sam --summary-file ${pair_id}.alignerout.txt
+	hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --add-chrname --no-unal --dta -x ${index_path}/hisat_index/genome -U $fq1 -S out.sam --summary-file ${pair_id}.alignerout.txt
    else
-	hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --add-chrname --no-unal --dta -x ${index_path}/hisat_index/genome -1 ${fq1} -2 ${fq2} -S out.sam --summary-file ${pair_id}.alignerout.txt
+	hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --add-chrname --no-unal --dta -x ${index_path}/hisat_index/genome -1 $fq1 -2 $fq2 -S out.sam --summary-file ${pair_id}.alignerout.txt
    fi
    samtools view -1 --threads $SLURM_CPUS_ON_NODE -o output.bam out.sam
 fi

--- a/alignment/starfusion.sh
+++ b/alignment/starfusion.sh
@@ -29,11 +29,7 @@ if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then
    usage
 fi

-if [ $refgeno == 'GRCh38' ] || [ $refgeno == 'GRCm38' ]; then
-    index_path=/project/shared/bicf_workflow_ref/${refgeno}/CTAT_lib/
-else
-    usage
-fi
+index_path=${refgeno}/CTAT_lib/

 module add python/2.7.x-anaconda star/2.5.2b
 STAR-Fusion --genome_lib_dir ${index_path} --left_fq ${fq1} --right_fq ${fq2} --output_dir star_fusion &> star_fusion.err

--- a/diff_exp/geneabundance.sh
+++ b/diff_exp/geneabundance.sh
+#!/bin/bash
+#rnaseqalign.sh
+
+usage() {
+  echo "-h Help documentation for hisat.sh"
+  echo "-b  --BAM File"
+  echo "-g  --GTF File"
+  echo "-s  --stranded"
+  echo "-p  --Prefix for output file name"
+  echo "Example: bash geneabudance.sh genect_rnaseq/geneabundance.sh -s stranded -g gtf_file -p pair_id -b bam"
+  exit 1
+}
+OPTIND=1 # Reset OPTIND
+while getopts :b:g:p:s:h opt
+do
+    case $opt in
+        b) sbam=$OPTARG;;
+        g) gtf=$OPTARG;;
+        p) pair_id=$OPTARG;;
+        s) stranded=$OPTARG;;
+        h) usage;;
+    esac
+done
+
+shift $(($OPTIND -1))
+
+# Check for mandatory options
+if [[ -z $pair_id ]] || [[ -z $sbam ]]
+then
+    usage
+fi
+if [[ -z $SLURM_CPUS_ON_NODE ]]
+then
+    SLURM_CPUS_ON_NODE=1
+fi
+module load subread/1.5.0-intel stringtie/1.1.2-intel
+featureCounts -s $stranded -T $SLURM_CPUS_ON_NODE -p -g gene_name -a ${gtf} -o ${pair_id}.cts ${sbam}
+
+mkdir ${pair_id}_stringtie
+cd ${pair_id}_stringtie
+stringtie ../${sbam} -p $SLURM_CPUS_ON_NODE -G ../${gtf} -B -e -o denovo.gtf -A ../${pair_id}.fpkm.txt
+ 
\ No newline at end of file
--- a/variants/gatkrunner.sh
+++ b/variants/gatkrunner.sh
@@ -2,7 +2,7 @@
 #gatkrunner.sh

 usage() {
-  echo "-h Help documentation for hisat.sh"
+  echo "-h Help documentation for gatkrunner.sh"
  echo "-r  --Reference Genome: GRCh38 or GRCm38"
  echo "-b  --BAM File"
  echo "-p  --Prefix for output file name"
@@ -14,7 +14,7 @@ OPTIND=1 # Reset OPTIND
 while getopts :r:a:b:p:h opt
 do
    case $opt in
-        r) refgeno=$OPTARG;;
+        r) index_path=$OPTARG;;
        b) sbam=$OPTARG;;
        p) pair_id=$OPTARG;;
        a) algo=$OPTARG;;
@@ -25,13 +25,8 @@ done
 shift $(($OPTIND -1))

 # Check for mandatory options
-if [[ -z $pair_id ]] || [[ -z $bam ] || [[ -z $refgeno ]]]; then
-    usage
-fi
-
-if [ $refgeno == 'GRCh38' ] || [ $refgeno == 'GRCm38' ] || [ $refgeno == 'GRCh37' ]; then
-    index_path=/project/shared/bicf_workflow_ref/${refgeno}
-else
+if [[ -z $pair_id ]] || [[ -z $sbam ]] || [[ -z $index_path ]]
+then
    usage
 fi

@@ -39,23 +34,47 @@ if [[ -z $SLURM_CPUS_ON_NODE ]]
 then
    SLURM_CPUS_ON_NODE=1
 fi
-
-dbsnp="${index_path}/Snp.vcf.gz"
-knownindel="${index_path}/Snp.vcf.gz"
-reffa="${index_path}/genome.fa"
+if [[ -a "${index_path}/dbSnp.vcf.gz" ]]
+then
+    dbsnp="${index_path}/dbSnp.vcf.gz"
+else 
+    echo "Missing dbSNP File: ${index_path}/dbSnp.vcf.gz"
+    usage
+fi
+if [[ -a "${index_path}/GoldIndels.vcf.gz" ]]
+then
+    knownindel="${index_path}/GoldIndels.vcf.gz"
+else 
+    echo "Missing InDel File: ${index_path}/GoldIndels.vcf.gz"
+    usage
+fi
+if [[ -a "${index_path}/genome.fa" ]]
+then
+    reffa="${index_path}/genome.fa"
+else 
+    echo "Missing Fasta File: ${index_path}/genome.fa"
+    usage
+fi

 module load gatk/3.7 samtools/1.6
+samtools index -@ $SLURM_CPUS_ON_NODE ${sbam}

-if [[ $algo == 'gatkbam' ]]
+if [[ $algo == 'gatkbam_rna' ]]
+then
+    module load picard/2.10.3
+    java -Xmx4g -jar $PICARD/picard.jar CleanSam INPUT=${sbam} OUTPUT=${pair_id}.clean.bam
+    #java -Xmx4g -jar $PICARD/picard.jar AddOrReplaceReadGroups INPUT=${pair_id}.clean.bam O=${pair_id}.rg_added_sorted.bam SO=coordinate RGID=${pair_id} RGLB=tx RGPL=illumina RGPU=barcode RGSM=${pair_id}
+    samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.clean.bam
+    java -Xmx4g -jar $GATK_JAR -T SplitNCigarReads -R ${reffa} -I ${pair_id}.clean.bam -o ${pair_id}.split.bam -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS
+    java -Xmx32g -jar $GATK_JAR -T RealignerTargetCreator -known ${knownindel} -R ${reffa} -o ${pair_id}.bam.list -I ${pair_id}.split.bam -nt $SLURM_CPUS_ON_NODE -nct 1
+    java -Xmx32g -jar $GATK_JAR -I ${pair_id}.split.bam -R ${reffa} --filter_mismatching_base_and_quals -T IndelRealigner -targetIntervals ${pair_id}.bam.list -o ${pair_id}.realigned.bam
+    java -Xmx32g -jar $GATK_JAR -l INFO -R ${reffa} --knownSites ${dbsnp} -I ${pair_id}.realigned.bam -T BaseRecalibrator -cov ReadGroupCovariate -cov QualityScoreCovariate -cov CycleCovariate -cov ContextCovariate -o ${pair_id}.recal_data.grp -nt 1 -nct $SLURM_CPUS_ON_NODE
+    java -Xmx32g -jar $GATK_JAR -T PrintReads -R ${reffa} -I ${pair_id}.realigned.bam -BQSR ${pair_id}.recal_data.grp -o ${pair_id}.final.bam -nt 1 -nct $SLURM_CPUS_ON_NODE
+elif [[ $algo == 'gatkbam' ]]
 then
-    samtools index ${sbam}
  java -Xmx32g -jar $GATK_JAR -T RealignerTargetCreator -known ${knownindel} -R ${reffa} -o ${pair_id}.bam.list -I ${sbam} -nt $SLURM_CPUS_ON_NODE -nct 1
  java -Xmx32g -jar $GATK_JAR -I ${sbam} -R ${reffa} --filter_mismatching_base_and_quals -T IndelRealigner -targetIntervals ${pair_id}.bam.list -o ${pair_id}.realigned.bam
  java -Xmx32g -jar $GATK_JAR -l INFO -R ${reffa} --knownSites ${dbsnp} -I ${pair_id}.realigned.bam -T BaseRecalibrator -cov ReadGroupCovariate -cov QualityScoreCovariate -cov CycleCovariate -cov ContextCovariate -o ${pair_id}.recal_data.grp -nt 1 -nct $SLURM_CPUS_ON_NODE
-  java -Xmx32g -jar $GATK_JAR -T PrintReads -R ${reffa} -I ${pair_id}.realigned.bam -BQSR ${pair_id}.recal_data.grp -o ${pair_id}.final.bam -nt 1 -nct 8
-elif
-then
-
-else
-
+  java -Xmx32g -jar $GATK_JAR -T PrintReads -R ${reffa} -I ${pair_id}.realigned.bam -BQSR ${pair_id}.recal_data.grp -o ${pair_id}.final.bam -nt 1 -nct $SLURM_CPUS_ON_NODE
 fi
+
--- a/variants/germline_vc.sh
+++ b/variants/germline_vc.sh
+usage() {
+  echo "-h Help documentation for gatkrunner.sh"
+  echo "-r  --Reference Genome: GRCh38 or GRCm38"
+  echo "-p  --Prefix for output file name"
+  echo "-a  --Algorithm/Command"
+  echo "Example: bash hisat.sh -p prefix -r GRCh38"
+  exit 1
+}
+OPTIND=1 # Reset OPTIND
+while getopts :r:a:b:p:h opt
+do
+    case $opt in
+        r) refgeno=$OPTARG;;
+        p) pair_id=$OPTARG;;
+        a) algo=$OPTARG;;
+        h) usage;;
+    esac
+done
+function join_by { local IFS="$1"; shift; echo "$*"; }
+shift $(($OPTIND -1))
+
+# Check for mandatory options
+if [[ -z $pair_id ]] || [[ -z $bam ] || [[ -z $refgeno ]]]; then
+    usage
+fi
+if [[ -z $SLURM_CPUS_ON_NODE ]]
+then
+    SLURM_CPUS_ON_NODE=1
+fi
+if [[ -a "${index_path}/dbSnp.vcf.gz" ]]
+then
+    dbsnp="${index_path}/dbSnp.vcf.gz"
+else 
+    echo "Missing dbSNP File: ${index_path}/dbSnp.vcf.gz"
+    usage
+fi
+if [[ -a "${index_path}/GoldIndels.vcf.gz" ]]
+then
+    knownindel="${index_path}/GoldIndels.vcf.gz"
+else 
+    echo "Missing InDel File: ${index_path}/GoldIndels.vcf.gz"
+    usage
+fi
+if [[ -a "${index_path}/genome.fa" ]]
+then
+    reffa="${index_path}/genome.fa"
+else 
+    echo "Missing Fasta File: ${index_path}/genome.fa"
+    usage
+fi
+module load python/2.7.x-anaconda samtools/1.6 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14
+
+for i in *.bam; do
+    samtools index -@ $SLURM_CPUS_ON_NODE $i
+done
+
+if [[ $algo == 'mpileup' ]]
+then
+    samtools mpileup -t 'AD,DP,INFO/AD' -ug -Q20 -C50 -f ${reffa} *.bam | bcftools call -vmO z -o ${pair_id}.sam.ori.vcf.gz
+    vcf-sort ${pair_id}.sam.ori.vcf.gz | vcf-annotate -n --fill-type | bcftools norm -c s -f ${reffa} -w 10 -O z -o ${pair_id}.sam.vcf.gz -
+elif [[ $algo == 'hotspot' ]]
+then
+    samtools mpileup -d 99999 -t 'AD,DP,INFO/AD' -uf ${reffa} *.bam > ${pair_id}.mpi
+    bcftools filter -i "AD[1]/DP > 0.01" ${pair_id}.mpi | bcftools filter -i "DP > 50" | bcftools call -m -A |vcf-annotate -n --fill-type |  bcftools norm -c s -f ${reffa} -w 10 -O z -o ${pair_id}.lowfreq.vcf.gz -
+    java -jar $SNPEFF_HOME/SnpSift.jar annotate ${index_path}/cosmic.vcf.gz ${pair_id}.lowfreq.vcf.gz | java -jar $SNPEFF_HOME/SnpSift.jar filter "(CNT[*] >0)" - |bgzip > ${pair_id}.hotspot.vcf.gz
+elif
+then [[ $algo == 'speedseq' ]]
+    module load speedseq/20160506
+    speedseq var -t $SLURM_CPUS_ON_NODE -o ssvar ${reffa} *.bam
+    vcf-annotate -n --fill-type ssvar.vcf.gz| bcftools norm -c s -f ${reffa} -w 10 -O z -o ${pair_id}.ssvar.vcf.gz -
+elif [[ $algo == 'gatk' ]]
+then
+    module load gatk/3.7
+    $gvcflist = ''
+    for i in *.bam; do
+	java -Djava.io.tmpdir=./ -Xmx32g -jar $GATK_JAR -R ${reffa} -D ${dbsnp} -T HaplotypeCaller -stand_call_conf 30 -stand_emit_conf 10.0 -A FisherStrand -A QualByDepth -A VariantType -A DepthPerAlleleBySample -A HaplotypeScore -A AlleleBalance -variant_index_type LINEAR -variant_index_parameter 128000 --emitRefConfidence GVCF -I $i -o ${i}.gatk.g.vcf -nct 2 &
+	$gvcflist += "--variant ${i}.gatk.g.vcf "
+    done
+    wait
+    java -Djava.io.tmpdir=./ -Xmx32g -jar $GATK_JAR -R ${reffa} -D ${dbsnp} -T GenotypeGVCFs -o gatk.vcf -nt $SLURM_CPUS_ON_NODE $gvcflist
+    vcf-annotate -n --fill-type gatk.vcf | bcftools norm -c s -f ${reffa} -w 10 -O z -o ${pair_id}.gatk.vcf.gz -
+    tabix ${pair_id}.gatk.vcf.gz
+elif
+then [[ $algo == 'platypus' ]]
+    module load platypus/gcc/0.8.1
+    $bamlist = join_by , *.bam
+    Platypus.py callVariants --minMapQual=10 --mergeClusteredVariants=1 --nCPU=$SLURM_CPUS_ON_NODE --bamFiles=${bamlist} --refFile=${reffa} --output=platypus.vcf
+    vcf-sort platypus.vcf |vcf-annotate -n --fill-type -n |bgzip > platypus.vcf.gz
+    tabix platypus.vcf.gz
+    bcftools norm -c s -f ${reffa} -w 10 -O z -o ${i}.pl.vcf.gz platypus.vcf.gz
+fi