Source code for compass.parallel

import multiprocessing
import os
import subprocess
import warnings

import numpy as np
from mpas_tools.logging import check_call


def get_available_parallel_resources(config):
    """
    Get the number of total cores and nodes available for running steps

    Parameters
    ----------
    config : compass.config.CompassConfigParser
        Configuration options for the test case

    Returns
    -------
    available_resources : dict
        A dictionary containing available resources: ``cores``, ``nodes``,
        ``cores_per_node`` and ``mpi_allowed``, plus ``gpus_per_node`` when
        that option is present in the ``parallel`` section of the config

    Raises
    ------
    ValueError
        If the ``system`` option in the ``parallel`` section is not one of
        ``slurm``, ``login`` or ``single_node``
    """
    parallel_system = config.get('parallel', 'system')
    if parallel_system == 'slurm' and 'SLURM_JOB_ID' not in os.environ:
        # configured for Slurm but no job ID is present in the environment,
        # so fall back to the resources used for the 'login' system
        parallel_system = 'login'

    if parallel_system == 'slurm':
        job_id = os.environ['SLURM_JOB_ID']
        node = os.environ['SLURMD_NODENAME']

        # query the layout of the current node from sinfo, one field at a
        # time so that each call prints a single integer
        args = ['sinfo', '--noheader', '--node', node, '-o', '%X']
        sockets_per_node = _get_subprocess_int(args)
        args = ['sinfo', '--noheader', '--node', node, '-o', '%Y']
        cores_per_socket = _get_subprocess_int(args)

        # a value from the config takes precedence over what sinfo reports
        if config.has_option('parallel', 'threads_per_core'):
            threads_per_core = config.getint('parallel', 'threads_per_core')
        else:
            args = ['sinfo', '--noheader', '--node', node, '-o', '%Z']
            threads_per_core = _get_subprocess_int(args)

        cores_per_node = \
            sockets_per_node * cores_per_socket * threads_per_core

        # the number of nodes comes from the current job in the queue
        args = ['squeue', '--noheader', '-j', job_id, '-o', '%D']
        nodes = _get_subprocess_int(args)
        cores = cores_per_node * nodes
        mpi_allowed = True
    elif parallel_system == 'login':
        # never use more cores than the ``login_cores`` config option allows
        cores = min(multiprocessing.cpu_count(),
                    config.getint('parallel', 'login_cores'))
        cores_per_node = cores
        nodes = 1
        mpi_allowed = False
    elif parallel_system == 'single_node':
        cores = multiprocessing.cpu_count()
        # ``cores_per_node`` is an optional upper bound on this system
        if config.has_option('parallel', 'cores_per_node'):
            cores = min(cores, config.getint('parallel', 'cores_per_node'))
        cores_per_node = cores
        nodes = 1
        mpi_allowed = True
    else:
        raise ValueError(f'Unexpected parallel system: {parallel_system}')

    available_resources = dict(
        cores=cores,
        nodes=nodes,
        cores_per_node=cores_per_node,
        mpi_allowed=mpi_allowed
    )

    if config.has_option('parallel', 'gpus_per_node'):
        available_resources['gpus_per_node'] = \
            config.getint('parallel', 'gpus_per_node')

    return available_resources
def set_cores_per_node(config, cores_per_node):
    """
    Record the given number of cores per node in the ``cores_per_node``
    option of the ``parallel`` section of the config.

    On a ``slurm`` system, the given value always replaces the configured
    one and a warning is issued if the two differ.  On a ``single_node``
    system, the given value is stored only if the option is not already
    set.  For any other system, the config is left untouched.

    Parameters
    ----------
    config : compass.config.CompassConfigParser
        Configuration options to update (presumably the same parser type
        used elsewhere in this module; any object providing ``get``,
        ``getint``, ``has_option`` and ``set`` works here)

    cores_per_node : int
        The number of cores per node to store
    """
    parallel_system = config.get('parallel', 'system')
    if parallel_system == 'slurm':
        # the option may be absent, in which case there is nothing to
        # compare against and the new value is simply stored
        if config.has_option('parallel', 'cores_per_node'):
            old_cores_per_node = config.getint('parallel', 'cores_per_node')
        else:
            old_cores_per_node = None

        config.set('parallel', 'cores_per_node', f'{cores_per_node}')

        if old_cores_per_node is not None and \
                old_cores_per_node != cores_per_node:
            warnings.warn(f'Slurm found {cores_per_node} cpus per node but '
                          f'config from mache was {old_cores_per_node}')
    elif parallel_system == 'single_node':
        # an explicitly configured value wins on a single node
        if not config.has_option('parallel', 'cores_per_node'):
            config.set('parallel', 'cores_per_node', f'{cores_per_node}')
def run_command(args, cpus_per_task, ntasks, openmp_threads, config, logger):
    """
    Run a subprocess with the given command-line arguments and resources

    The command is launched through the ``parallel_executable`` from the
    ``parallel`` section of the config, with resource flags appended that
    depend on the ``system`` option in the same section.

    Parameters
    ----------
    args : list of str
        The command-line arguments to run in parallel

    cpus_per_task : int
        the number of cores per task, passed to the parallel executable
        with ``-c`` on ``slurm`` systems (not used on ``single_node``
        systems)

    ntasks : int
        the number of tasks, passed to the parallel executable with ``-n``

    openmp_threads : int
        the number of OpenMP threads to use, exported to the subprocess as
        ``OMP_NUM_THREADS``

    config : configparser.ConfigParser
        Configuration options for the test case

    logger : logging.Logger
        A logger for output from the step

    Raises
    ------
    ValueError
        If the ``system`` option in the ``parallel`` section is neither
        ``slurm`` nor ``single_node``
    """
    # work on a copy so the environment of this process is not modified
    env = dict(os.environ)
    env['OMP_NUM_THREADS'] = f'{openmp_threads}'
    if openmp_threads > 1:
        logger.info(f'Running with {openmp_threads} OpenMP threads')

    parallel_executable = config.get('parallel', 'parallel_executable')

    # split the parallel executable into constituents in case it includes
    # flags; splitting on any run of whitespace avoids empty-string
    # arguments when the option contains repeated or surrounding spaces
    command_line_args = parallel_executable.split()

    parallel_system = config.get('parallel', 'system')
    if parallel_system == 'slurm':
        cores = ntasks * cpus_per_task
        cores_per_node = config.getint('parallel', 'cores_per_node')
        # enough whole nodes to hold all the requested cores
        nodes = int(np.ceil(cores / cores_per_node))
        command_line_args.extend(['-c', f'{cpus_per_task}', '-N', f'{nodes}',
                                  '-n', f'{ntasks}'])
    elif parallel_system == 'single_node':
        command_line_args.extend(['-n', f'{ntasks}'])
    else:
        raise ValueError(f'Unexpected parallel system: {parallel_system}')

    command_line_args.extend(args)

    check_call(command_line_args, logger, env=env)
def _get_subprocess_int(args): value = subprocess.check_output(args) value = int(value.decode('utf-8').strip('\n')) return value