Skip to content
Snippets Groups Projects
Commit 15a6a639 authored by Brandi Cantarel's avatar Brandi Cantarel
Browse files

add markdups

parent eb46c4a5
Branches
Tags
No related merge requests found
#!/bin/env python
import sys
import pysam
import argparse
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-b', '--bam',
help="The bam file",
required=True)
parser.add_argument('-o', '--out',
help="The outfile",
default='outfile.bam')
args = parser.parse_args()
return args
# set the tag names - take a look at SAM spec to pick an appropriate one
args = get_args()
infile = pysam.AlignmentFile(args.bam, "rb")
out = pysam.AlignmentFile(args.out, "wb", template=infile)
for read in infile.fetch():
read.set_tag('RX', read.qname.split(":")[-1])
out.write(read)
infile.close()
out.close()
#!/bin/bash
#hisat.sh
pair_id=$1
index_path=$2
fq1=$3
fq2=$4
usage() {
echo "-h Help documentation for hisat.sh"
echo "-r --Reference Genome: GRCh38 or GRCm38"
echo "-a --FastQ R1"
echo "-b --FastQ R2"
echo "-p --Prefix for output file name"
echo "Example: bash hisat.sh -p prefix -r GRCh38 -a SRR1551047_1.fastq.gz -b SRR1551047_2.fastq.gz"
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :r:a:b:p:h opt
do
case $opt in
r) refgeno=$OPTARG;;
a) fq1=$OPTARG;;
b) fq2=$OPTARG;;
p) pair_id=$OPTARG;;
h) usage;;
esac
done
shift $(($OPTIND -1))
# Check for mandatory options
if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then
usage
fi
if [ $refgeno == 'GRCh38' ] || [ $refgeno == 'GRCm38' ]; then
index_path=/project/shared/bicf_workflow_ref/${refgeno}/hisat_index/
else
usage
fi
if [[ -z $SLURM_CPUS_ON_NODE ]]
then
SLURM_CPUS_ON_NODE=1
fi
module load hisat2/2.1.0-intel samtools/1.4.1
module load hisat2/2.1.0-intel samtools/gcc/1.6 picard/2.10.3
if [ $fq1 == $fq2 ]
then
hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg '@RG\\tID:${pair_id}\\tLB:tx\\tPL:illumina\\tPU:barcode\\tSM:${pair_id}' --add-chrname --no-unal --dta -x ${index_path}/genome -U ${fq1} -S out.sam --summary-file ${pair_id}.alignerout.txt
hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --add-chrname --no-unal --dta -x ${index_path}/genome -U ${fq1} -S out.sam --summary-file ${pair_id}.alignerout.txt
else
hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg '@RG\\tID:${pair_id}\\tLB:tx\\tPL:illumina\\tPU:barcode\\tSM:${pair_id}' --add-chrname --no-unal --dta -x ${index_path}/genome -1 ${fq1} -2 ${fq2} -S out.sam --summary-file ${pair_id}.alignerout.txt
hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --add-chrname --no-unal --dta -x ${index_path}/genome -1 ${fq1} -2 ${fq2} -S out.sam --summary-file ${pair_id}.alignerout.txt
fi
samtools view -1 --threads $SLURM_CPUS_ON_NODE -o output.bam out.sam
#fixmateinfomation
samtools sort --threads $SLURM_CPUS_ON_NODE -o ${pair_id}.bam output.bam
samtools sort -@ $SLURM_CPUS_ON_NODE -O BAM -n -o output.nsort.bam output.bam
java -jar $PICARD/picard.jar FixMateInformation ASSUME_SORTED=TRUE SORT_ORDER=coordinate ADD_MATE_CIGAR=TRUE I=output.nsort.bam O=${pair_id}.bam
samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.bam
#!/bin/bash
#hisat.sh
usage() {
echo "-h --Help documentation for markdups.sh"
echo "-m --Mark Duplication Method: sambamba, samtools, picard, picard_umi, fgbio_umi, null; default is null"
echo "-b --BAM file"
echo "-p --Prefix for output file name"
echo "Example: bash markdups.sh -p prefix -b file.bam -a picard"
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :a:b:p:h opt
do
case $opt in
a) algo=$OPTARG;;
b) sbam=$OPTARG;;
p) pair_id=$OPTARG;;
h) usage;;
esac
done
shift $(($OPTIND -1))
# Check for mandatory options
if [[ -z $pair_id ]] || [[ -z $sbam ]]; then
usage
fi
if [[ -z $SLURM_CPUS_ON_NODE ]]
then
SLURM_CPUS_ON_NODE=1
fi
module load picard/2.10.3 samtools/1.6
if [ $algo == 'sambamba' ]
then
module load speedseq/20160506
sambamba markdup -t $SLURM_CPUS_ON_NODE -r ${sbam} ${pair_id}.dedup.bam
elif [ $algo == 'samtools' ]
then
module load samtools/1.6
samtools sort -n -@ $SLURM_CPUS_ON_NODE -o nsort.bam ${sbam}
samtools fixmate -c --output-fmt BAM -m -@ $SLURM_CPUS_ON_NODE nsort.bam fix.bam
samtools sort -n -@ $SLURM_CPUS_ON_NODE -o sort.bam fix.bam
samtools markdup -s --output-fmt BAM -@ $SLURM_CPUS_ON_NODE sort.bam ${pair_id}.dedup.bam
elif [ $algo == 'picard' ]
then
java -Djava.io.tmpdir=./ -Xmx4g -jar $PICARD/picard.jar MarkDuplicates I=${sbam} O=${prefix}.dedup.bam M=${pair_id}.dedup.stat.txt
elif [ $algo == 'picard_umi' ]
then
java -Djava.io.tmpdir=./ -Xmx4g -jar $PICARD/picard.jar MarkDuplicates BARCODE_TAG=RX I=${sbam} O=${pair_id}.dedup.bam M=${pair_id}.dedup.stat.txt
elif [ $algo == 'fgbio_umi' ]
then
source activate fgbiotools
fgbio GroupReadsByUmi -s identity -i ${sbam} -o ${pair_id}.group.bam -m 10
fgbio CallMolecularConsensusReads -i ${pair_id}.group.bam -p consensus -M 1 -o ${pair_id}.consensus.bam -S ':none:'
source deactivate
module load bwa/intel/0.7.15
samtools index ${pair_id}.consensus.bam
samtools fastq -1 ${pair_id}.consensus.R1.fastq -2 ${pair_id}.consensus.R2.fastq ${pair_id}.consensus.bam
gzip ${pair_id}.consensus.R1.fastq
gzip ${pair_id}.consensus.R2.fastq
bwa mem -M -C -t 2 -R '@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}' /project/shared/bicf_workflow_ref/GRCh38/genome.fa ${pair_id}.consensus.R1.fastq.gz ${pair_id}.consensus.R2.fastq.gz | samtools view -1 - > ${pair_id}.consensus.bam
samtools sort --threads 10 -o ${pair_id}.dedup.bam ${pair_id}.consensus.bam
else
cp ${sbam} ${prefix}.dedup.bam
fi
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment