diff options
author | Clark Williams <williams@redhat.com> | 2017-01-09 20:48:02 -0600 |
---|---|---|
committer | Clark Williams <williams@redhat.com> | 2017-03-15 14:16:51 -0500 |
commit | 6e4d54b8c6d3697efba0e3d44727b10485a07381 (patch) | |
tree | b73d67129b56d407b05e3e418f5d0c1a497ef1e6 | |
parent | c98ab9e517208018099f9097080d4e703ab75c7e (diff) | |
download | rteval-6e4d54b8c6d3697efba0e3d44727b10485a07381.tar.gz |
kcompile.py: modify to manage jobs on per-node basis
This modification to kcompile takes into account NUMA topology
rather than treating the system as a big SMP box. Make loads are
started on a per-node basis (e.g. on a four-node system, four make
instances would be started), and each make command is restricted to the
CPU cores available on its node.
This is an attempt to reduce cross-node traffic and to better mimic
a well-behaved realtime application.
Signed-off-by: Clark Williams <williams@redhat.com>
-rw-r--r-- | rteval/modules/loads/kcompile.py | 161 | ||||
-rw-r--r-- | rteval/rtevalConfig.py | 5 | ||||
-rw-r--r-- | rteval/systopology.py (renamed from rteval/sysinfo/systopology.py) | 3 |
3 files changed, 109 insertions, 60 deletions
diff --git a/rteval/modules/loads/kcompile.py b/rteval/modules/loads/kcompile.py index 1eb2cde..ef636c2 100644 --- a/rteval/modules/loads/kcompile.py +++ b/rteval/modules/loads/kcompile.py @@ -1,6 +1,7 @@ # # Copyright 2009 - 2013 Clark Williams <williams@redhat.com> # Copyright 2012 - 2013 David Sommerseth <davids@redhat.com> +# Copyright 2014 - 2017 Clark Williams <williams@redhat.com> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -23,19 +24,89 @@ # are deemed to be part of the source code. # -import sys, os, glob, subprocess +import sys, os, os.path, glob, subprocess from signal import SIGTERM from rteval.modules import rtevalRuntimeError from rteval.modules.loads import CommandLineLoad from rteval.Log import Log from rteval.misc import expand_cpulist +from rteval.systopology import SysTopology + +kernel_prefix="linux-4.9" + +class KBuildJob(object): + '''Class to manage a build job bound to a particular node''' + + def __init__(self, node, kdir, logger=None): + self.kdir = kdir + self.jobid = None + self.node = node + self.logger = logger + self.builddir = os.path.dirname(kdir) + self.objdir = "%s/node%d" % (self.builddir, int(node)) + if not os.path.isdir(self.objdir): + os.mkdir(self.objdir) + if os.path.exists('/usr/bin/numactl'): + self.binder = 'numactl --cpunodebind %d' % int(self.node) + else: + self.binder = 'taskset -c %s' % str(self.node) + self.jobs = self.calc_jobs_per_cpu() * len(self.node) + self.log(Log.DEBUG, "node %d: jobs == %d" % (int(node), self.jobs)) + self.runcmd = "%s make O=%s -C %s -j%d bzImage modules" % (self.binder, self.objdir, self.kdir, self.jobs) + self.cleancmd = "%s make O=%s -C %s clean allmodconfig" % (self.binder, self.objdir, self.kdir) + self.log(Log.DEBUG, "node%d kcompile command: %s" % (int(node), self.runcmd)) + + def __str__(self): + return self.runcmd + + def log(self, logtype, msg): + if self.logger: + 
self.logger.log(logtype, "[kcompile node%d] %s" % (int(self.node), msg)) + + def calc_jobs_per_cpu(self): + mult = 2 + self.log(Log.DEBUG, "calulating jobs for node %d" % int(self.node)) + # get memory total in gigabytes + mem = int(self.node.meminfo['MemTotal']) / 1024.0 / 1024.0 / 1024.0 + # ratio of gigabytes to #cores + ratio = float(mem) / float(len(self.node)) + if ratio < 1.0: + ratio = 1.0 + if ratio < 1.0 or ratio > 2.0: + mult = 1 + self.log(Log.DEBUG, "memory/cores ratio on node %d: %f" % (int(self.node), ratio)) + self.log(Log.DEBUG, "returning jobs/core value of: %d" % int(ratio) * mult) + return int(int(ratio) * int(mult)) + + def clean(self, sin=None, sout=None, serr=None): + self.log(Log.DEBUG, "cleaning objdir %s" % self.objdir) + subprocess.call(self.cleancmd, shell=True, + stdin=sin, stdout=sout, stderr=serr) + + def run(self, sin=None, sout=None, serr=None): + self.log(Log.INFO, "starting workload on node %d" % int(self.node)) + self.log(Log.DEBUG, "running on node %d: %s" % (int(self.node), self.runcmd)) + self.jobid = subprocess.Popen(self.runcmd, shell=True, + stdin=sin, stdout=sout, stderr=serr) + + def isrunning(self): + if self.jobid == None: + return False + return (self.jobid.poll() == None) + + def stop(self): + if not self.jobid: + return True + return self.jobid.terminate() -kernel_prefix="linux-2.6" class Kcompile(CommandLineLoad): def __init__(self, config, logger): + self.buildjobs = {} + self.config = config + self.topology = SysTopology() CommandLineLoad.__init__(self, "kcompile", config, logger) - + self.logger = logger def _WorkloadSetup(self): # find our source tarball @@ -80,13 +151,18 @@ class Kcompile(CommandLineLoad): break if kdir == None: raise rtevalRuntimeError(self, "Can't find kernel directory!") - self.jobs = 1 # We only run one instance of the kcompile job self.mydir = os.path.join(self.builddir, kdir) self._log(Log.DEBUG, "mydir = %s" % self.mydir) + self._log(Log.DEBUG, "systopology: %s" % self.topology) + 
self.jobs = len(self.topology) + self.args = [] + for n in self.topology: + self._log(Log.DEBUG, "Configuring build job for node %d" % int(n)) + self.buildjobs[n] = KBuildJob(n, self.mydir, self.logger) + self.args.append(str(self.buildjobs[n])+";") def _WorkloadBuild(self): - self._log(Log.DEBUG, "setting up all module config file in %s" % self.mydir) null = os.open("/dev/null", os.O_RDWR) if self._logging: out = self.open_logfile("kcompile-build.stdout") @@ -94,9 +170,9 @@ class Kcompile(CommandLineLoad): else: out = err = null - # clean up from potential previous run + # clean up any damage from previous runs try: - ret = subprocess.call(["make", "-C", self.mydir, "mrproper", "allmodconfig"], + ret = subprocess.call(["make", "-C", self.mydir, "mrproper"], stdin=null, stdout=out, stderr=err) if ret: raise rtevalRuntimeError(self, "kcompile setup failed: %d" % ret) @@ -104,31 +180,15 @@ class Kcompile(CommandLineLoad): self._log(Log.DEBUG, "keyboard interrupt, aborting") return self._log(Log.DEBUG, "ready to run") - os.close(null) if self._logging: os.close(out) os.close(err) + # clean up object dirs and make sure each has a config file + for n in self.topology: + self.buildjobs[n].clean(sin=null,sout=null,serr=null) + os.close(null) self._setReady() - - def __calc_numjobs(self): - mult = int(self._cfg.setdefault('jobspercore', 1)) - mem = self.memsize[0] - if self.memsize[1] == 'KB': - mem = mem / (1024.0 * 1024.0) - elif self.memsize[1] == 'MB': - mem = mem / 1024.0 - elif self.memsize[1] == 'TB': - mem = mem * 1024 - ratio = float(mem) / float(self.num_cpus) - if ratio > 1.0: - njobs = self.num_cpus * mult - else: - self._log(Log.DEBUG, "Low memory system (%f GB/core)! 
Dropping jobs to one per core" % ratio) - njobs = self.num_cpus - return njobs - - def _WorkloadPrepare(self): self.__nullfd = os.open("/dev/null", os.O_RDWR) if self._logging: @@ -143,41 +203,30 @@ class Kcompile(CommandLineLoad): else: cpulist = "" - self.jobs = self.__calc_numjobs() - self._log(Log.DEBUG, "starting loop (jobs: %d)" % self.jobs) - - self.args = ["make", "-C", self.mydir, - "-j%d" % self.jobs ] - - if cpulist: - self.args = ["taskset", '-c', cpulist] + self.args - - self.__kcompileproc = None - - def _WorkloadTask(self): - if not self.__kcompileproc or self.__kcompileproc.poll() is not None: - # If kcompile has not been kicked off yet, or have completed, - # restart it - self._log(Log.DEBUG, "Kicking off kcompile: %s" % " ".join(self.args)) - self.__kcompileproc = subprocess.Popen(self.args, - stdin=self.__nullfd, - stdout=self.__outfd, - stderr=self.__errfd) - + for n in self.topology: + if not self.buildjobs[n]: + raise RuntimeError, "Build job not set up for node %d" % int(n) + if self.buildjobs[n].jobid is None or self.buildjobs[n].jobid.poll() is not None: + self._log(Log.INFO, "Starting load on node %d" % n) + self.buildjobs[n].run(self.__nullfd, self.__outfd, self.__errfd) def WorkloadAlive(self): - # Let _WorkloadTask() kick off new runs, if it stops - thus - # kcompile will always be alive + # if any of the jobs has stopped, return False + for n in self.topology: + if self.buildjobs[n].jobid.poll() is not None: + return False return True def _WorkloadCleanup(self): self._log(Log.DEBUG, "out of stopevent loop") - if self.__kcompileproc.poll() == None: - self._log(Log.DEBUG, "killing compile job with SIGTERM") - os.kill(self.__kcompileproc.pid, SIGTERM) - self.__kcompileproc.wait() + for n in self.buildjobs: + if self.buildjobs[n].jobid.poll() == None: + self._log(Log.DEBUG, "stopping job on node %d" % int(n)) + self.buildjobs[n].jobid.terminate() + self.buildjobs[n].jobid.wait() + del self.buildjobs[n].jobid os.close(self.__nullfd) del 
self.__nullfd if self._logging: @@ -185,14 +234,12 @@ class Kcompile(CommandLineLoad): del self.__outfd os.close(self.__errfd) del self.__errfd - del self.__kcompileproc self._setFinished() - def ModuleParameters(): return {"source": {"descr": "Source tar ball", - "default": "linux-2.6.21.tar.bz2", + "default": "linux-4.9.tar.xz", "metavar": "TARBALL"}, "jobspercore": {"descr": "Number of working threads per core", "default": 2, diff --git a/rteval/rtevalConfig.py b/rteval/rtevalConfig.py index 47488a0..16ac2d0 100644 --- a/rteval/rtevalConfig.py +++ b/rteval/rtevalConfig.py @@ -33,7 +33,7 @@ import os, sys import ConfigParser from Log import Log -from sysinfo.systopology import SysTopology +from systopology import SysTopology def get_user_name(): name = os.getenv('SUDO_USER') @@ -190,8 +190,7 @@ class rtevalConfig(object): # get our system topology info self.__systopology = SysTopology() - # debug - print self.__systopology + print("got system topology: %s" % self.__systopology) # Import the default config first for sect, vals in default_config.items(): diff --git a/rteval/sysinfo/systopology.py b/rteval/systopology.py index 0295276..7c0985e 100644 --- a/rteval/sysinfo/systopology.py +++ b/rteval/systopology.py @@ -147,6 +147,9 @@ class NumaNode(object): def __str__(self): return self.getcpustr() + def __int__(self): + return self.nodeid + # read info about memory attached to this node def getmeminfo(self): self.meminfo = {} |