From d50d6b99fca5402335bc4722cfa1741bd08509f9 Mon Sep 17 00:00:00 2001 From: Venkat Malladi <Venkat.Malladi@utsouthwestern.edu> Date: Tue, 16 Aug 2016 14:09:56 -0500 Subject: [PATCH] Call peaks for H3K4me3. --- call-peaks-macs-single.sh | 169 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 call-peaks-macs-single.sh diff --git a/call-peaks-macs-single.sh b/call-peaks-macs-single.sh new file mode 100644 index 0000000..6e78c91 --- /dev/null +++ b/call-peaks-macs-single.sh @@ -0,0 +1,169 @@ +#!/bin/bash +# call-peaks-macs.sh + +script_name="call-peaks-macs2.sh" +script_ver="2.1.0" + +#Help function +usage() { + echo "-h --Help documentation for $script_name" + echo "-f --File path to experiment tagAlign files." + echo "-s --File path to control tagAlign files." + echo "-x --File path to experiment cross-correlation scores." + echo "-r --UCSC Reference genome (e.g. hg19, mm10)" + echo "-o --Path to output directory" + echo "-v --Version of script" + echo "Example: $script_name -f 'foo1.tagAlign.gz' -s 'con1.tagAlign.gz' -x 'foo1.cc' -r 'hg19' [-o '/path/to/output/dir/']" + exit 1 +} + +# Version function +version(){ + echo "$script_name $script_ver" + exit 1 +} + + +# Peak calling function +call_peak() { + # Establish variables + + experiment=$1 + control=$2 + xcor_scores_input=$3 + chrom_sizes=$4 + genomesize=$5 + out_dir=$6 + + #Extract the fragment length estimate from cross-correlation scores file + fraglen=`cat $xcor_scores_input | grep "predicted" | cut -f6 -d ' '` + echo $fraglen + + # Generate narrow peaks and preliminary signal tracks + base_fn=$(basename "${experiment}") + prefix=${base_fn%.tagAlign.gz} + macs2 callpeak -t $experiment -c $control -f BED -n $out_dir/$prefix -g $genomesize -p 1e-5 --nomodel --shift 0 --extsize $fraglen --keep-dup all -B --SPMR + + # Generate fold enrichment signal tracks + macs2 bdgcmp -t $out_dir/$prefix\_treat_pileup.bdg -c $out_dir/$prefix\_control_lambda.bdg --outdir $out_dir -o $prefix\_FE.bdg -m FE + + # Genearte bigWigs from bedgraph to support vizualization + bedtools slop -i $out_dir/$prefix\_FE.bdg -g /$chrom_sizes -b 0 | bedClip stdin $chrom_sizes $out_dir/$prefix.fc.signal.bedgraph + bedSort $out_dir/$prefix.fc.signal.bedgraph $out_dir/$prefix.sorted.fc.signal.bedgraph + bedGraphToBigWig $out_dir/$prefix.sorted.fc.signal.bedgraph $chrom_sizes $out_dir/$prefix.fc_signal.bw + rm $out_dir/$prefix.fc.signal.bedgraph + rm $out_dir/$prefix.sorted.fc.signal.bedgraph + + # Generate fold enrichment signal tracks + + # Compute sval = min(no. of reads in ChIP, no. of reads in control) / 1,000,000 + experiment_reads=`gzip -dc $experiment | wc -l | cut -f1` + control_reads=`gzip -dc $control | wc -l | cut -f1` + if [ $experiment_reads -ge $control_reads ]; then + min=$control_reads + else + min=$experiment_reads + fi + sval=`echo $min/1000000 | bc -l` + + macs2 bdgcmp -t $out_dir/$prefix\_treat_pileup.bdg -c $out_dir/$prefix\_control_lambda.bdg --outdir $out_dir -o $prefix\_ppois.bdg -m ppois -S $sval + + # Genearte bigWigs from bedgraph to support vizualization + bedtools slop -i $out_dir/$prefix\_ppois.bdg -g $chrom_sizes -b 0 | bedClip stdin $chrom_sizes $out_dir/$prefix.pval.signal.bedgraph + bedSort $out_dir/$prefix.pval.signal.bedgraph $out_dir/$prefix.sorted.pval.signal.bedgraph + bedGraphToBigWig $out_dir/$prefix.sorted.pval.signal.bedgraph $chrom_sizes $out_dir/$prefix.pval_signal.bw + rm $out_dir/$prefix.pval.signal.bedgraph + rm $out_dir/$prefix.sorted.pval.signal.bedgraph +} + +main(){ + + # Load required modules + module load python/2.7.x-anaconda + module load R/3.1.0-intel + module load macs/2.1.0-20151222 + module load gcc/4.8.1 + module load bedtools/2.17.0 + module load UCSC_userApps/v317 + + # Parsing options + OPTIND=1 # Reset OPTIND + while getopts :f:s:x:r:o:hv opt + do + case $opt in + f) aln=$OPTARG;; + s) control=$OPTARG;; + x) aln_cross=$OPTARG;; + r) ucsc_reference=$OPTARG;; + o) out=$OPTARG;; + h) usage;; + v) version;; + esac + done + + shift $(($OPTIND -1)) + + # Check for mandatory options + if [[ -z $aln ]] || [[ -z $control ]] || [[ -z $ucsc_reference ]] || [[ -z $aln_cross ]]; then + usage + fi + + # Check if length of arguments in aln1, aln2 and exp are the same + array_aln=(${aln//[,| ]/ }) + array_control=(${control//[,| ]/ }) + array_aln_cross=(${aln_cross//[,| ]/ }) + # Define the genome size to use + if [ $ucsc_reference = 'hg19' ]; then + genome_size='hs' + elif [ $ucsc_reference = 'mm10' ]; then + genome_size='mm' + elif [ $ucsc_reference = 'mm9' ]; then + genome_size='mm' + else + usage + fi + + # Define the output directory, if none defined make the location relative to first file + if [ -z $out ]; then + one_parent=$(dirname "${aln}") + out_dir=$(dirname "${one_parent}")\/$script_name-$script_ver/ + else + out_dir=$out\/$script_name-$script_ver + fi + + if [ ! -d $out_dir ]; then + mkdir $out_dir + fi + + # Align if file doesn't exist + if [ ! -e $out_dir/metadata.json ]; then + + # split files + rep1=${aln} + con1=${control} + rep1_xcor=${aln_cross} + + echo "* Downloading chrom.sizes..." + fetchChromSizes $ucsc_reference > $out_dir/chrom.sizes + chrom_sizes=$out_dir/chrom.sizes + + # Call peaks Replicates 1 and 2 + call_peak $rep1 $con1 $rep1_xcor $chrom_sizes $genome_size $out_dir + + # Remove chrom sizes + rm $out_dir/chrom.sizes + + # Get input and output files and then print out metadata.json file + input_files=("${array_aln[@]}" "${array_control[@]}" "${array_aln_cross[@]}" "$ucsc_reference") + printf -v input "\"%s\"," "${input_files[@]}" + input=${input%,} + output_file=($out_dir\/*) + printf -v output "\"%s\"," "${output_file[@]}" + output=${output%,} + printf '{"script name":"%s","script version":"%s", "input files": [%s], "output files": [%s]}' "$script_name" "$script_ver" "$input" "$output" | python -m json.tool > $out_dir/metadata.json + else + echo "* Peaks have been called" + fi +} + +main "$@" -- GitLab