upload alignment processes

9e9fa0e7 · Brandi Cantarel · 3090fd6a · 9e9fa0e7 · 9e9fa0e7 · 9e9fa0e7
Commit 9e9fa0e7 authored 7 years ago by Brandi Cantarel
--- a/alignment/bamqc.sh
+++ b/alignment/bamqc.sh
 #!/bin/bash
 #trimgalore.sh

-pair_id=$1
-bam=$2
+usage() {
+  echo "-h Help documentation for hisat.sh"
+  echo "-r  --Reference Genome: GRCh38 or GRCm38"
+  echo "-b  --BAM File"
+  echo "-n  --NucType"
+  echo "-p  --Prefix for output file name"
+  echo "-c  --Capture Bedfile"
+  echo "Example: bash hisat.sh -p prefix -r GRCh38 -a hisat -x SRR1551047_1.fastq.gz  -y SRR1551047_2.fastq.gz"
+  exit 1
+}
+OPTIND=1 # Reset OPTIND
+while getopts :r:b:n:p:h opt
+do
+    case $opt in
+        r) refgeno=$OPTARG;;
+        b) sbam=$OPTARG;;
+        c) bed=$OPTARG;;
+        y) nuctype=$OPTARG;;
+        p) pair_id=$OPTARG;;
+        h) usage;;
+    esac
+done

-module load samtools/1.4.1 fastqc/0.11.5
-samtools flagstat ${bam} > ${pair_id}.flagstat.txt
-fastqc -f bam ${bam}
+shift $(($OPTIND -1))
+
+# Check for mandatory options
+if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then
+    usage
+fi
+
+if [ $refgeno == 'GRCh38' ] || [ $refgeno == 'GRCm38' ]; then
+    index_path=/project/shared/bicf_workflow_ref/${refgeno}
+else
+    usage
+fi
+
+module load samtools/1.6 fastqc/0.11.5
+samtools flagstat ${sbam} > ${pair_id}.flagstat.txt
+fastqc -f bam ${sbam}
+
+if [[ $nuctype == 'dna' ]]; then
+    module load bedtools/2.26.0 picard/2.10.3
+    samtools view -b --threads $SLURM_CPUS_ON_NODE -L ${bed} -o ${pair_id}.ontarget.bam ${sbam}
+    samtools index ${pair_id}.ontarget.bam
+    samtools flagstat ${pair_id}.ontarget.bam > ${pair_id}.ontarget.flagstat.txt
+    samtools view -b -q 1 ${pair_id}.ontarget.bam | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed}  >  ${pair_id}.mapqualcov.txt
+    samtools view -b -F 1024 ${pair_id}.ontarget.bam | bedtools coverage -sorted -g  ${index_path}/genomefile.txt -a ${bed} -b stdin -hist | grep ^all > ${pair_id}.dedupcov.txt 
+    java -Xmx32g -jar \$PICARD/picard.jar CollectAlignmentSummaryMetrics R=${reffa} I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt
+    java -Xmx64g -jar \$PICARD/picard.jar EstimateLibraryComplexity I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.libcomplex.txt
+    samtools view -F 1024 ${pair_id}.ontarget.bam | awk '{sum+=\$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt
+    java -Xmx4g -jar \$PICARD/picard.jar CollectInsertSizeMetrics INPUT=${pair_id}.bam HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${reffa} OUTPUT=${pair_id}.hist.txt
+    samtools view -b -q 1 ${pair_id}.ontarget.bam | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed}  >  ${pair_id}.mapqualcov.txt
+    bedtools coverage -sorted -g  ${index_path}/genomefile.txt -a ${bed} -b ${sbam} -hist > ${pair_id}.covhist.txt
+    perl $baseDir/scripts/calculate_depthcov.pl ${pair_id}.covhist.txt
+    grep ^all ${pair_id}.covhist.txt >  ${pair_id}.genomecov.txt
+ fi
--- a/alignment/parse_seqqc.pl
+++ b/alignment/parse_seqqc.pl
--- a/alignment/parse_flagstat.pl
+++ b/alignment/parse_flagstat.pl
--- a/alignment/dnaseqalign.sh
+++ b/alignment/dnaseqalign.sh
+#!/bin/bash
+#dnaseqalign.sh
+
+usage() {
+  echo "-h Help documentation for hisat.sh"
+  echo "-r  --Reference Genome: GRCh38 or GRCm38"
+  echo "-x  --FastQ R1"
+  echo "-y  --FastQ R2"
+  echo "-p  --Prefix for output file name"
+  echo "Example: bash hisat.sh -p prefix -r GRCh38 -x SRR1551047_1.fastq.gz  -y SRR1551047_2.fastq.gz"
+  exit 1
+}
+OPTIND=1 # Reset OPTIND
+while getopts :r:a:x:y:p:h opt
+do
+    case $opt in
+        r) refgeno=$OPTARG;;
+        x) fq1=$OPTARG;;
+        y) fq2=$OPTARG;;
+	a) algo=$OPTARG;;
+        p) pair_id=$OPTARG;;
+        h) usage;;
+    esac
+done
+
+shift $(($OPTIND -1))
+
+# Check for mandatory options
+if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then
+    usage
+fi
+
+if [ $refgeno == 'GRCh38' ] || [ $refgeno == 'GRCm38' ] || [ $refgeno == 'GRCh37' ]; then
+    index_path=/project/shared/bicf_workflow_ref/${refgeno}
+else
+    usage
+fi
+
+if [[ -z $SLURM_CPUS_ON_NODE ]]
+then
+    SLURM_CPUS_ON_NODE=1
+fi
+
+module load bwakit/0.7.15 bwa/intel/0.7.15 samtools/1.6 bcftools/1.6 htslib/1.6 picard/2.10.3 speedseq/20160506
+if ($fq1 != $fq2)
+then
+    bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} > out.sam
+else
+    bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} ${fq2} > out.sam
+fi
+if [ $refgeno == 'GRCh38' ]
+then
+    cat out.sam | k8 /cm/shared/apps/bwakit/0.7.15/bwa-postalt.js -p ${pair_id}.hla ${index_path}/genome.fa.alt | samtools view -1 - > output.unsort.bam
+    run-HLA ${pair_id}.hla > ${pair_id}.hla.top 2> ${pair_id}.log.hla
+    touch ${pair_id}.hla.HLA-dummy.gt
+    cat ${pair_id}.hla.HLA*.gt | grep ^GT | cut -f2- > ${pair_id}.hla.all
+else 
+    samtools view -1 -o output.unsort.bam out.sam
+fi 
+samtools sort --threads $SLURM_CPUS_ON_NODE -o output.dups.bam output.unsort.bam
+samtools view -h output.unsort.bam | samblaster -M -a --excludeDups --addMateTags --maxSplitCount 2 --minNonOverlap 20 -d discordants.sam -s splitters.sam > temp.sam
+gawk '{ if (\$0~"^@") { print; next } else { \$10="*"; \$11="*"; print } }' OFS="\\t" splitters.sam | samtools  view -S -b - | samtools sort -o ${pair_id}.splitters.bam -
+gawk '{ if (\$0~"^@") { print; next } else { \$10="*"; \$11="*"; print } }' OFS="\\t" discordants.sam | samtools  view -S  -b - | samtools sort -o ${pair_id}.discordants.bam -
+java -Djava.io.tmpdir=./ -Xmx4g  -jar \$PICARD/picard.jar FixMateInformation ASSUME_SORTED=TRUE SORT_ORDER=coordinate ADD_MATE_CIGAR=TRUE I=output.dups.bam O=${pair_id}.bam
+samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.bam
--- a/alignment/hisat.sh
+++ b/alignment/hisat.sh
-#!/bin/bash
-#hisat.sh
-
-usage() {
-  echo "-h Help documentation for hisat.sh"
-  echo "-r  --Reference Genome: GRCh38 or GRCm38"
-  echo "-a  --FastQ R1"
-  echo "-b  --FastQ R2"
-  echo "-p  --Prefix for output file name"
-  echo "Example: bash hisat.sh -p prefix -r GRCh38 -a SRR1551047_1.fastq.gz  -b SRR1551047_2.fastq.gz"
-  exit 1
-}
-OPTIND=1 # Reset OPTIND
-while getopts :r:a:b:p:h opt
-do
-    case $opt in
-        r) refgeno=$OPTARG;;
-        a) fq1=$OPTARG;;
-        b) fq2=$OPTARG;;
-        p) pair_id=$OPTARG;;
-        h) usage;;
-    esac
-done
-
-shift $(($OPTIND -1))
-
-# Check for mandatory options
-if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then
-    usage
-fi
-
-if [ $refgeno == 'GRCh38' ] || [ $refgeno == 'GRCm38' ]; then
-    index_path=/project/shared/bicf_workflow_ref/${refgeno}/hisat_index/
-else
-    usage
-fi
-
-if [[ -z $SLURM_CPUS_ON_NODE ]]
-then
-    SLURM_CPUS_ON_NODE=1
-fi
-module load hisat2/2.1.0-intel samtools/gcc/1.6 picard/2.10.3
-if [ $fq1 == $fq2 ]
-then
-    hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --add-chrname --no-unal --dta -x ${index_path}/genome -U ${fq1} -S out.sam --summary-file ${pair_id}.alignerout.txt
-else
-    hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --add-chrname --no-unal --dta -x ${index_path}/genome -1 ${fq1} -2 ${fq2} -S out.sam --summary-file ${pair_id}.alignerout.txt
-fi
-samtools view -1 --threads $SLURM_CPUS_ON_NODE -o output.bam out.sam
-samtools sort -@ $SLURM_CPUS_ON_NODE -O BAM -n -o  output.nsort.bam output.bam
-java -jar $PICARD/picard.jar FixMateInformation ASSUME_SORTED=TRUE SORT_ORDER=coordinate ADD_MATE_CIGAR=TRUE I=output.nsort.bam O=${pair_id}.bam
-samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.bam
--- a/alignment/rnaseqalign.sh
+++ b/alignment/rnaseqalign.sh
+#!/bin/bash
+#rnaseqalign.sh
+
+usage() {
+  echo "-h Help documentation for hisat.sh"
+  echo "-r  --Reference Genome: GRCh38 or GRCm38"
+  echo "-x  --FastQ R1"
+  echo "-y  --FastQ R2"
+  echo "-a  --Method: hisat or star"
+  echo "-p  --Prefix for output file name"
+  echo "Example: bash hisat.sh -p prefix -r GRCh38 -a hisat -x SRR1551047_1.fastq.gz  -y SRR1551047_2.fastq.gz"
+  exit 1
+}
+OPTIND=1 # Reset OPTIND
+while getopts :r:a:x:y:p:h opt
+do
+    case $opt in
+        r) refgeno=$OPTARG;;
+        x) fq1=$OPTARG;;
+        y) fq2=$OPTARG;;
+	a) algo=$OPTARG;;
+        p) pair_id=$OPTARG;;
+        h) usage;;
+    esac
+done
+
+shift $(($OPTIND -1))
+
+# Check for mandatory options
+if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then
+    usage
+fi
+
+if [ $refgeno == 'GRCh38' ] || [ $refgeno == 'GRCm38' ]; then
+    index_path=/project/shared/bicf_workflow_ref/${refgeno}
+else
+    usage
+fi
+
+module load  samtools/gcc/1.6 picard/2.10.3
+if [[ -z $SLURM_CPUS_ON_NODE ]]
+then
+    SLURM_CPUS_ON_NODE=1
+fi
+if [ $algo == 'star' ]
+then
+    if ($fq1 != $fq2)
+    then
+	module load star/2.4.2a
+	STAR --genomeDir ${index_path}/star_index/ --readFilesIn ${fq1} ${fq2} --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out
+    else
+	STAR --genomeDir ${index_path}/${star_index} --readFilesIn ${fq1} --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out
+    fi
+    mv outLog.final.out ${pair_id}.alignerout.txt
+    mv outAligned.sortedByCoord.out.bam output.bam
+else
+    module load hisat2/2.1.0-intel
+    if [ $fq1 == $fq2 ]
+    then
+	hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --add-chrname --no-unal --dta -x ${index_path}/hisat_index/genome -U ${fq1} -S out.sam --summary-file ${pair_id}.alignerout.txt
+    else
+	hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --add-chrname --no-unal --dta -x ${index_path}/hisat_index/genome -1 ${fq1} -2 ${fq2} -S out.sam --summary-file ${pair_id}.alignerout.txt
+    fi
+    samtools view -1 --threads $SLURM_CPUS_ON_NODE -o output.bam out.sam
+fi
+samtools sort -@ $SLURM_CPUS_ON_NODE -O BAM -n -o  output.nsort.bam output.bam
+java -jar $PICARD/picard.jar FixMateInformation ASSUME_SORTED=TRUE SORT_ORDER=coordinate ADD_MATE_CIGAR=TRUE I=output.nsort.bam O=${pair_id}.bam
+samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.bam
--- a/alignment/star_aligner.sh
+++ b/alignment/star_aligner.sh
-#!/bin/bash
-#hisat.sh
-
-pair_id=$1
-index_path=$2
-fq1=$3
-fq2=$4
-
-module load star/2.4.2a samtools/1.4.1
-
-if ($fq1 != $fq2)
-then
-    STAR --genomeDir ${index_path}/star_index/ --readFilesIn ${fq1} ${fq2} --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out
-else
-    STAR --genomeDir ${index_path}/${star_index} --readFilesIn ${fq1} --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out
-fi
-mv outLog.final.out ${pair_id}.alignerout.txt
-samtools sort --threads \$SLURM_CPUS_ON_NODE -o ${pair_id}.bam outAligned.sortedByCoord.out.bam
--- a/cnvkit.sh
+++ b/cnvkit.sh
--- a/variants/gatkrunner.sh
+++ b/variants/gatkrunner.sh
+#!/bin/bash
+#gatkrunner.sh
+
+usage() {
+  echo "-h Help documentation for hisat.sh"
+  echo "-r  --Reference Genome: GRCh38 or GRCm38"
+  echo "-b  --BAM File"
+  echo "-p  --Prefix for output file name"
+  echo "-a  --Algorithm/Command"
+  echo "Example: bash hisat.sh -p prefix -r GRCh38 -b File.bam"
+  exit 1
+}
+OPTIND=1 # Reset OPTIND
+while getopts :r:a:b:p:h opt
+do
+    case $opt in
+        r) refgeno=$OPTARG;;
+        b) sbam=$OPTARG;;
+        p) pair_id=$OPTARG;;
+        a) algo=$OPTARG;;
+        h) usage;;
+    esac
+done
+
+shift $(($OPTIND -1))
+
+# Check for mandatory options
+if [[ -z $pair_id ]] || [[ -z $bam ] || [[ -z $refgeno ]]]; then
+    usage
+fi
+
+if [ $refgeno == 'GRCh38' ] || [ $refgeno == 'GRCm38' ] || [ $refgeno == 'GRCh37' ]; then
+    index_path=/project/shared/bicf_workflow_ref/${refgeno}
+else
+    usage
+fi
+
+if [[ -z $SLURM_CPUS_ON_NODE ]]
+then
+    SLURM_CPUS_ON_NODE=1
+fi
+
+dbsnp="${index_path}/Snp.vcf.gz"
+knownindel="${index_path}/Snp.vcf.gz"
+reffa="${index_path}/genome.fa"
+
+module load gatk/3.7 samtools/1.6
+
+if [[ $algo == 'gatkbam' ]]
+then
+    samtools index ${sbam}
+  java -Xmx32g -jar $GATK_JAR -T RealignerTargetCreator -known ${knownindel} -R ${reffa} -o ${pair_id}.bam.list -I ${sbam} -nt $SLURM_CPUS_ON_NODE -nct 1
+  java -Xmx32g -jar $GATK_JAR -I ${sbam} -R ${reffa} --filter_mismatching_base_and_quals -T IndelRealigner -targetIntervals ${pair_id}.bam.list -o ${pair_id}.realigned.bam
+  java -Xmx32g -jar $GATK_JAR -l INFO -R ${reffa} --knownSites ${dbsnp} -I ${pair_id}.realigned.bam -T BaseRecalibrator -cov ReadGroupCovariate -cov QualityScoreCovariate -cov CycleCovariate -cov ContextCovariate -o ${pair_id}.recal_data.grp -nt 1 -nct $SLURM_CPUS_ON_NODE
+  java -Xmx32g -jar $GATK_JAR -T PrintReads -R ${reffa} -I ${pair_id}.realigned.bam -BQSR ${pair_id}.recal_data.grp -o ${pair_id}.final.bam -nt 1 -nct 8
+elif
+then
+
+else
+
+fi
--- a/vcf/lca.R
+++ b/vcf/lca.R
--- a/vcf/parse_svresults.pl
+++ b/vcf/parse_svresults.pl
--- a/vcf/transvar2bed.pl
+++ b/vcf/transvar2bed.pl
--- a/vcf/uniform_integrated_vcf.pl
+++ b/vcf/uniform_integrated_vcf.pl