Commit f3a77aea authored by David Trudgian
Browse files

Working srun spearmint and gridsearch

parent ba36eb89
......@@ -27,7 +27,7 @@ def rosenbrock(x, y):
def main():
arguments = docopt(__doc__)
time.sleep(random.random())
time.sleep(random.random() * 10)
print rosenbrock(float(arguments['--x']), float(arguments['--y']))
......
......@@ -23,7 +23,6 @@ class LocalExecutor(BaseExecutor):
cpus_per_task = params.vals['cpus_per_task']
self.conc_tasks = (cpus / cpus_per_task)
logger.info(" - Will run %d concurrent local tasks" % self.conc_tasks)
logger.debug(" - Initializing parallel pool")
super(LocalExecutor, self).__init__(cwd, params)
def run(self, task_index, cmd):
......
......@@ -42,18 +42,12 @@ class SrunExecutor(BaseExecutor):
super(SrunExecutor, self).__init__(cwd, params)
def run(self, commands):
"""Run provided commands, in parallel up to conc_tasks."""
p = Pool(self.conc_tasks)
for cmd in commands:
p.apply_async(self.wrap_cmd, [self.task_count, cmd], callback=self.write_trace)
self.task_count = self.task_count + 1
p.close()
p.join()
def run(self, task_index, cmd):
"""Run provided commands, in parallel up to conc_tasks."""
logger.debug("LocalExecutor is handling command: %s", cmd)
return self.wrap_cmd(task_index, cmd)
return self.task_summaries[self.task_count - 1]
def _run_cmd(self, cmd, stderr_file, stdout_file):
"""SLURM execution of a command using srun.
......@@ -72,7 +66,7 @@ class SrunExecutor(BaseExecutor):
srun_cmd = "srun --exclusive -N1 -n1 --cpus-per-task=%d --distribution=cyclic %s" % (
cpus_per_task, cmd['__command']['value'])
logging.debug("srun command for task is: %s", srun_cmd)
logging.debug(" - srun command for task is: %s", srun_cmd)
ret = call(srun_cmd, shell=True, stderr=stderr_file,
stdout=stdout_file, env=os.environ)
......
......@@ -34,6 +34,14 @@ class SpearmintOptimizer(BaseOptimizer):
# increases overhead.
time.sleep(1)
pool.close()
pool.terminate()
pool.join()
logger.info("- Optimization completed after %d tasks run", self.task_index + 1)
logger.info("- Best score: %f\ttask %d", self.best_task['score'],
self.best_task['task_index'])
def attempt_dispatch(self, expt_config, pool, expt_dir, chooser):
expt = load_experiment(expt_config)
......@@ -57,9 +65,9 @@ class SpearmintOptimizer(BaseOptimizer):
(n_candidates, n_pending, n_complete))
# Track the time series of optimization.
if n_complete >= 1000:
logger.info(" - Spearmint: Maximum number of finished jobs (%d) reached."
"Exiting" % 1000)
if n_complete >= 10:
logger.info(" - Spearmint: Maximum number of finished jobs (%d) reached. "
"Exiting" % 10)
return False
if n_candidates == 0:
......@@ -96,21 +104,20 @@ class SpearmintOptimizer(BaseOptimizer):
locker = Locker()
locker.unlock(grid_for(job))
pool.apply_async(self.run_job, [expt_grid, job_id, self.task_index], callback=self.score_check)
sp_params = expt_grid.get_params(job_id)
pool.apply_async(self.run_job, [sp_params, job_id, self.task_index], callback=self.score_check)
expt_grid.set_submitted(job_id, job_id)
self.task_index = self.task_index + 1
return True
def run_job(self, expt_grid, job_id, task_index):
def run_job(self, sp_params, job_id, task_index):
try:
logger.info(" - Starting task %d (spearmint job %d)", task_index, job_id)
sp_params = expt_grid.get_params(job_id)
args = self.params.arg_template()
for p in sp_params:
args[p.name]['value'] = p.dbl_val[0]
......@@ -119,7 +126,7 @@ class SpearmintOptimizer(BaseOptimizer):
summary = self.executor.run(task_index, args)
expt_grid.set_complete(job_id, summary["_score"], 0)
ExperimentGrid.job_complete(self.executor.logdir, job_id, summary["_score"], 0)
logger.info(" - Finished task %d (spearmint job %d) score: %f", task_index, job_id, summary["_score"])
......
......@@ -52,10 +52,10 @@ class ExperimentGrid:
@staticmethod
def job_complete(expt_dir, id, value, duration):
log("setting job %d complete" % id)
# log("setting job %d complete" % id)
expt_grid = ExperimentGrid(expt_dir)
expt_grid.set_complete(id, value, duration)
log("set...")
# log("set...")
@staticmethod
def job_broken(expt_dir, id):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment