Skip to content
Snippets Groups Projects
call-peaks-macs-single.sh 5.7 KiB
Newer Older
Venkat Malladi's avatar
Venkat Malladi committed
#!/bin/bash
# call-peaks-macs.sh

script_name="call-peaks-macs2.sh"
script_ver="2.1.0"

#Help function
usage() {
  echo "-h  --Help documentation for $script_name"
  echo "-f  --File path to experiment tagAlign files."
  echo "-s  --File path to control tagAlign files."
  echo "-x  --File path to experiment cross-correlation scores."
  echo "-r  --UCSC Reference genome (e.g. hg19, mm10)"
  echo "-o  --Path to output directory"
  echo "-v  --Version of script"
  echo "Example: $script_name -f 'foo1.tagAlign.gz' -s 'con1.tagAlign.gz' -x 'foo1.cc' -r 'hg19' [-o '/path/to/output/dir/']"
  exit 1
}

# Version function
version(){
    echo "$script_name $script_ver"
    exit 1
}


# Peak calling function
call_peak() {
    # Establish variables

    experiment=$1
    control=$2
    xcor_scores_input=$3
    chrom_sizes=$4
    genomesize=$5
    out_dir=$6

    #Extract the fragment length estimate from cross-correlation scores file
    fraglen=`cat $xcor_scores_input |  grep "predicted" | cut -f6 -d ' '`
    echo $fraglen

    # Generate narrow peaks and preliminary signal tracks
    base_fn=$(basename "${experiment}")
    prefix=${base_fn%.tagAlign.gz}
    macs2 callpeak -t $experiment -c $control -f BED -n $out_dir/$prefix -g $genomesize -p 1e-5 --nomodel --shift 0 --extsize $fraglen --keep-dup all -B --SPMR

    # Generate fold enrichment signal tracks
    macs2 bdgcmp -t $out_dir/$prefix\_treat_pileup.bdg -c $out_dir/$prefix\_control_lambda.bdg --outdir $out_dir -o $prefix\_FE.bdg -m FE

    # Genearte bigWigs from bedgraph to support vizualization
    bedtools slop -i $out_dir/$prefix\_FE.bdg -g /$chrom_sizes -b 0 | bedClip stdin $chrom_sizes $out_dir/$prefix.fc.signal.bedgraph
    bedSort $out_dir/$prefix.fc.signal.bedgraph $out_dir/$prefix.sorted.fc.signal.bedgraph
    bedGraphToBigWig $out_dir/$prefix.sorted.fc.signal.bedgraph $chrom_sizes $out_dir/$prefix.fc_signal.bw
    rm $out_dir/$prefix.fc.signal.bedgraph
    rm $out_dir/$prefix.sorted.fc.signal.bedgraph

    # Generate fold enrichment signal tracks

    # Compute sval = min(no. of reads in ChIP, no. of reads in control) / 1,000,000
    experiment_reads=`gzip -dc $experiment | wc -l | cut -f1`
    control_reads=`gzip -dc $control | wc -l | cut -f1`
    if [ $experiment_reads -ge $control_reads ]; then
        min=$control_reads
    else
        min=$experiment_reads
    fi
    sval=`echo $min/1000000 | bc -l`

    macs2 bdgcmp -t $out_dir/$prefix\_treat_pileup.bdg -c $out_dir/$prefix\_control_lambda.bdg --outdir $out_dir -o $prefix\_ppois.bdg -m ppois -S $sval

    # Genearte bigWigs from bedgraph to support vizualization
    bedtools slop -i $out_dir/$prefix\_ppois.bdg -g $chrom_sizes -b 0 | bedClip stdin $chrom_sizes $out_dir/$prefix.pval.signal.bedgraph
    bedSort $out_dir/$prefix.pval.signal.bedgraph $out_dir/$prefix.sorted.pval.signal.bedgraph
    bedGraphToBigWig $out_dir/$prefix.sorted.pval.signal.bedgraph $chrom_sizes $out_dir/$prefix.pval_signal.bw
    rm $out_dir/$prefix.pval.signal.bedgraph
    rm $out_dir/$prefix.sorted.pval.signal.bedgraph
}

main(){

    # Load required modules
    module load python/2.7.x-anaconda
    module load R/3.1.0-intel
    module load macs/2.1.0-20151222
    module load gcc/4.8.1
    module load bedtools/2.17.0
    module load UCSC_userApps/v317

    # Parsing options
    OPTIND=1 # Reset OPTIND
    while getopts :f:s:x:r:o:hv opt
        do
            case $opt in
                f) aln=$OPTARG;;
                s) control=$OPTARG;;
                x) aln_cross=$OPTARG;;
                r) ucsc_reference=$OPTARG;;
                o) out=$OPTARG;;
                h) usage;;
                v) version;;
            esac
        done

    shift $(($OPTIND -1))

    # Check for mandatory options
    if [[ -z $aln ]] || [[ -z $control ]] || [[ -z $ucsc_reference ]] || [[ -z $aln_cross ]]; then
        usage
    fi

    # Check if length of arguments in aln1, aln2 and exp are the same
    array_aln=(${aln//[,| ]/ })
    array_control=(${control//[,| ]/ })
    array_aln_cross=(${aln_cross//[,| ]/ })
    # Define the genome size to use
    if [ $ucsc_reference = 'hg19' ]; then
        genome_size='hs'
    elif [ $ucsc_reference = 'mm10' ]; then
        genome_size='mm'
    elif [ $ucsc_reference = 'mm9' ]; then
        genome_size='mm'
    else
        usage
    fi

    # Define the output directory, if none defined make the location relative to first file
    if [ -z $out ]; then
        one_parent=$(dirname "${aln}")
        out_dir=$(dirname "${one_parent}")\/$script_name-$script_ver/
    else
        out_dir=$out\/$script_name-$script_ver
    fi

    if [ ! -d $out_dir ]; then
        mkdir $out_dir
    fi

    # Align if file doesn't exist
    if [ ! -e $out_dir/metadata.json ]; then

        # split files
        rep1=${aln}
        con1=${control}
        rep1_xcor=${aln_cross}

        echo "* Downloading chrom.sizes..."
        fetchChromSizes $ucsc_reference > $out_dir/chrom.sizes
        chrom_sizes=$out_dir/chrom.sizes

        # Call peaks Replicates 1 and 2
        call_peak $rep1 $con1 $rep1_xcor $chrom_sizes $genome_size $out_dir

        # Remove chrom sizes
        rm $out_dir/chrom.sizes

        # Get input and output files and then print out metadata.json file
        input_files=("${array_aln[@]}" "${array_control[@]}" "${array_aln_cross[@]}" "$ucsc_reference")
        printf -v input "\"%s\"," "${input_files[@]}"
        input=${input%,}
        output_file=($out_dir\/*)
        printf -v output "\"%s\"," "${output_file[@]}"
        output=${output%,}
        printf '{"script name":"%s","script version":"%s", "input files": [%s], "output files": [%s]}' "$script_name" "$script_ver" "$input" "$output"  | python -m json.tool > $out_dir/metadata.json
    else
        echo "* Peaks have been called"
    fi
}

main "$@"