diff --git a/alignment/add_umi_bam.py b/alignment/add_umi_bam.py new file mode 100644 index 0000000000000000000000000000000000000000..87cae3fd1c806d75a59dcf13a54e6af138ac40be --- /dev/null +++ b/alignment/add_umi_bam.py @@ -0,0 +1,29 @@ +#!/bin/env python +import sys +import pysam +import argparse +def get_args(): + '''Define arguments.''' + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-b', '--bam', + help="The bam file", + required=True) + parser.add_argument('-o', '--out', + help="The outfile", + default='outfile.bam') + args = parser.parse_args() + return args + +# set the tag names - take a look at SAM spec to pick an appropriate one +args = get_args() +infile = pysam.AlignmentFile(args.bam, "rb") +out = pysam.AlignmentFile(args.out, "wb", template=infile) +for read in infile.fetch(): + read.set_tag('RX', read.qname.split(":")[-1]) + out.write(read) + +infile.close() +out.close() diff --git a/alignment/hisat.sh b/alignment/hisat.sh index 2e0bdfb729c480f6a750eecce5367735c4500729..87af91b85d018f8a85baefe62c2c3f89198a1ea3 100644 --- a/alignment/hisat.sh +++ b/alignment/hisat.sh @@ -1,22 +1,52 @@ #!/bin/bash #hisat.sh -pair_id=$1 -index_path=$2 -fq1=$3 -fq2=$4 +usage() { + echo "-h Help documentation for hisat.sh" + echo "-r --Reference Genome: GRCh38 or GRCm38" + echo "-a --FastQ R1" + echo "-b --FastQ R2" + echo "-p --Prefix for output file name" + echo "Example: bash hisat.sh -p prefix -r GRCh38 -a SRR1551047_1.fastq.gz -b SRR1551047_2.fastq.gz" + exit 1 +} +OPTIND=1 # Reset OPTIND +while getopts :r:a:b:p:h opt +do + case $opt in + r) refgeno=$OPTARG;; + a) fq1=$OPTARG;; + b) fq2=$OPTARG;; + p) pair_id=$OPTARG;; + h) usage;; + esac +done + +shift $(($OPTIND -1)) + +# Check for mandatory options +if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then + usage +fi + +if [ $refgeno == 'GRCh38' ] || [ $refgeno == 'GRCm38' ]; then + index_path=/project/shared/bicf_workflow_ref/${refgeno}/hisat_index/ +else + usage +fi + if [[ -z $SLURM_CPUS_ON_NODE ]] then SLURM_CPUS_ON_NODE=1 fi - -module load hisat2/2.1.0-intel samtools/1.4.1 +module load hisat2/2.1.0-intel samtools/gcc/1.6 picard/2.10.3 if [ $fq1 == $fq2 ] then - hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg '@RG\\tID:${pair_id}\\tLB:tx\\tPL:illumina\\tPU:barcode\\tSM:${pair_id}' --add-chrname --no-unal --dta -x ${index_path}/genome -U ${fq1} -S out.sam --summary-file ${pair_id}.alignerout.txt + hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --add-chrname --no-unal --dta -x ${index_path}/genome -U ${fq1} -S out.sam --summary-file ${pair_id}.alignerout.txt else - hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg '@RG\\tID:${pair_id}\\tLB:tx\\tPL:illumina\\tPU:barcode\\tSM:${pair_id}' --add-chrname --no-unal --dta -x ${index_path}/genome -1 ${fq1} -2 ${fq2} -S out.sam --summary-file ${pair_id}.alignerout.txt + hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --add-chrname --no-unal --dta -x ${index_path}/genome -1 ${fq1} -2 ${fq2} -S out.sam --summary-file ${pair_id}.alignerout.txt fi samtools view -1 --threads $SLURM_CPUS_ON_NODE -o output.bam out.sam -#fixmateinfomation -samtools sort --threads $SLURM_CPUS_ON_NODE -o ${pair_id}.bam output.bam +samtools sort -@ $SLURM_CPUS_ON_NODE -O BAM -n -o output.nsort.bam output.bam +java -jar $PICARD/picard.jar FixMateInformation ASSUME_SORTED=TRUE SORT_ORDER=coordinate ADD_MATE_CIGAR=TRUE I=output.nsort.bam O=${pair_id}.bam +samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.bam diff --git a/alignment/markdups.sh b/alignment/markdups.sh new file mode 100644 index 0000000000000000000000000000000000000000..4f44d3cb9d8fce34720d156f74b42d90e9f160b7 --- /dev/null +++ b/alignment/markdups.sh @@ -0,0 +1,69 @@ +#!/bin/bash +#hisat.sh + +usage() { + echo "-h --Help documentation for markdups.sh" + echo "-m --Mark Duplication Method: sambamba, samtools, picard, picard_umi, fgbio_umi, null; default is null" + echo "-b --BAM file" + echo "-p --Prefix for output file name" + echo "Example: bash markdups.sh -p prefix -b file.bam -a picard" + exit 1 +} +OPTIND=1 # Reset OPTIND +while getopts :a:b:p:h opt +do + case $opt in + a) algo=$OPTARG;; + b) sbam=$OPTARG;; + p) pair_id=$OPTARG;; + h) usage;; + esac +done + +shift $(($OPTIND -1)) + +# Check for mandatory options +if [[ -z $pair_id ]] || [[ -z $sbam ]]; then + usage +fi + +if [[ -z $SLURM_CPUS_ON_NODE ]] +then + SLURM_CPUS_ON_NODE=1 +fi + +module load picard/2.10.3 samtools/1.6 + +if [ $algo == 'sambamba' ] +then + module load speedseq/20160506 + sambamba markdup -t $SLURM_CPUS_ON_NODE -r ${sbam} ${pair_id}.dedup.bam +elif [ $algo == 'samtools' ] +then + module load samtools/1.6 + samtools sort -n -@ $SLURM_CPUS_ON_NODE -o nsort.bam ${sbam} + samtools fixmate -c --output-fmt BAM -m -@ $SLURM_CPUS_ON_NODE nsort.bam fix.bam + samtools sort -n -@ $SLURM_CPUS_ON_NODE -o sort.bam fix.bam + samtools markdup -s --output-fmt BAM -@ $SLURM_CPUS_ON_NODE sort.bam ${pair_id}.dedup.bam +elif [ $algo == 'picard' ] +then + java -Djava.io.tmpdir=./ -Xmx4g -jar $PICARD/picard.jar MarkDuplicates I=${sbam} O=${prefix}.dedup.bam M=${pair_id}.dedup.stat.txt +elif [ $algo == 'picard_umi' ] +then + java -Djava.io.tmpdir=./ -Xmx4g -jar $PICARD/picard.jar MarkDuplicates BARCODE_TAG=RX I=${sbam} O=${pair_id}.dedup.bam M=${pair_id}.dedup.stat.txt +elif [ $algo == 'fgbio_umi' ] +then + source activate fgbiotools + fgbio GroupReadsByUmi -s identity -i ${sbam} -o ${pair_id}.group.bam -m 10 + fgbio CallMolecularConsensusReads -i ${pair_id}.group.bam -p consensus -M 1 -o ${pair_id}.consensus.bam -S ':none:' + source deactivate + module load bwa/intel/0.7.15 + samtools index ${pair_id}.consensus.bam + samtools fastq -1 ${pair_id}.consensus.R1.fastq -2 ${pair_id}.consensus.R2.fastq ${pair_id}.consensus.bam + gzip ${pair_id}.consensus.R1.fastq + gzip ${pair_id}.consensus.R2.fastq + bwa mem -M -C -t 2 -R '@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}' /project/shared/bicf_workflow_ref/GRCh38/genome.fa ${pair_id}.consensus.R1.fastq.gz ${pair_id}.consensus.R2.fastq.gz | samtools view -1 - > ${pair_id}.consensus.bam + samtools sort --threads 10 -o ${pair_id}.dedup.bam ${pair_id}.consensus.bam +else + cp ${sbam} ${prefix}.dedup.bam +fi