diff --git a/workflow/main.nf b/workflow/main.nf index baf38e9bc499c1a86045084e21fdee3e6f8c3814..e042ad2a473176d3e2a700c256857a1e2c47cd43 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -14,6 +14,17 @@ params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ? params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false params.cutoffRatio = 1.2 +params.topPeakCount = 600 + +// Define regular variables +pairedEnd = params.pairedEnd +designFile = params.designFile +genomeSize = params.genomeSize +genome = params.genome +chromSizes = params.chromSizes +fasta = params.fasta +cutoffRatio = params.cutoffRatio +topPeakCount = params.topPeakCount // Check inputs if( params.bwaIndex ){ @@ -31,15 +42,8 @@ readsList = Channel .map { file -> [ file.getFileName().toString(), file.toString() ].join("\t")} .collectFile( name: 'fileList.tsv', newLine: true ) -// Define regular variables -pairedEnd = params.pairedEnd -designFile = params.designFile -genomeSize = params.genomeSize -genome = params.genome -chromSizes = params.chromSizes -fasta = params.fasta -cutoffRatio = params.cutoffRatio +// Check design file for errors process checkDesignFile { publishDir "$baseDir/output/design", mode: 'copy' @@ -458,6 +462,6 @@ process motifSearch { script: """ - python3 $baseDir/scripts/motif_search.py -i $designMotifSearch -g $fasta + python3 $baseDir/scripts/motif_search.py -d $designMotifSearch -g $fasta -p $topPeakCount """ } diff --git a/workflow/scripts/motif_search.py b/workflow/scripts/motif_search.py index 5cf27851e02e964f894fc304a29cfb1cfcef20cd..9feac8c39201af96700f69370a70b24a7111a5fc 100644 --- a/workflow/scripts/motif_search.py +++ b/workflow/scripts/motif_search.py @@ -2,7 +2,7 @@ '''Call Motifs on called peaks.''' - +import os import sys import re from re import sub @@ -42,6 +42,10 @@ def get_args(): help="The genome FASTA file.", required=True) + parser.add_argument('-p', '--peak', + help="The number of peaks to use.", + required=True) + args = parser.parse_args() return args @@ -50,33 +54,39 @@ def get_args(): def run_wrapper(args): motif_search(*args) -def motif_search(filename, genome, experiment): - sorted_fn = 'sorted-%s' % (filename) +def motif_search(filename, genome, experiment, peak): + + file_basename = os.path.basename(filename) + sorted_fn = 'sorted-%s' % (file_basename) out_fa = '%s.fa' % (experiment) out_motif = '%s_memechip' % (experiment) - # Sort Bed file by - out, err = run_pipe([ + # Sort Bed file and limit number of peaks + if peak == -1: + peak = utils.count_lines(filename) + + out, err = utils.run_pipe([ 'sort -k %dgr,%dgr %s' % (5, 5, filename), - sorted_fn) + 'head -n %s' % (peak)], outfile=sorted_fn) # Get fasta file out, err = utils.run_pipe([ - '"bedtools getfasta -fi %s -bed %s -fo %s' % (genome, sorted_fn, out_fa)]) + 'bedtools getfasta -fi %s -bed %s -fo %s' % (genome, sorted_fn, out_fa)]) #Call memechip out, err = utils.run_pipe([ - '"meme-chip -oc %s -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 %s -meme-norand' % (out_motif, out_fa)]) + 'meme-chip -oc %s -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 %s -norand' % (out_motif, out_fa)]) def main(): args = get_args() design = args.design genome = args.genome + peak = args.peak # Read files design_df = pd.read_csv(design, sep='\t') - meme_arglist = zip(design_df['Peaks'].tolist(),[genome]*design_df.shape[0],design_df['Experiment'].tolist()) + meme_arglist = zip(design_df['Peaks'].tolist(), [genome]*design_df.shape[0], design_df['Condition'].tolist(), [peak]*design_df.shape[0]) work_pool = Pool(min(12,design_df.shape[0])) return_list = work_pool.map(run_wrapper, meme_arglist) work_pool.close()