# Node ID 69918440479476a3be5064c74fca61ea35cea842 # Parent 32d1739cbfd76a7dd420ee4039ffeb23d7dcade8 diff --git a/configs/example/multi-system-apu.py b/configs/example/multi-system-apu.py new file mode 100644 --- /dev/null +++ b/configs/example/multi-system-apu.py @@ -0,0 +1,519 @@ +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#
# Authors: Sooraj Puthoor
#          Brandon Potter
#

import glob
import math
import optparse
import os
import re

import m5
# buildEnv is consulted below for the PROTOCOL and TARGET_ISA checks; it
# was missing from the original import list (NameError at first use).
from m5.defines import buildEnv
from m5.objects import *
from m5.util import addToPath, fatal

addToPath("../ruby")
addToPath("../common")
addToPath("../topologies")

import Options
import Ruby
import Simulation
import GPUTLBOptions
import GPUTLBConfig


def find_path(base_list, rel_path, test):
    """Search for rel_path under each directory in base_list.

    Returns the first joined path accepted by the `test` predicate
    (e.g. os.path.isfile). If no base matches, fail out of the
    configuration entirely.
    """
    for base in base_list:
        # An empty/None entry contributes nothing to the search path.
        if not base:
            continue
        full_path = os.path.join(base, rel_path)
        if test(full_path):
            return full_path
    fatal("%s not found in %s" % (rel_path, base_list))


def find_file(base_list, rel_path):
    """Special instance of find_path that only accepts regular files."""
    return find_path(base_list, rel_path, os.path.isfile)


def setOption(parser, opt_str, value=1):
    """Set the option's value if the option is defined, else raise.

    Uses setattr rather than exec-with-%s interpolation so that string
    (or arbitrary) values are assigned safely.
    """
    if parser.has_option(opt_str):
        opt = parser.get_option(opt_str)
        setattr(parser.values, opt.dest, value)
        return
    raise Exception("cannot find %s in list of possible options" % opt_str)


def getOption(parser, opt_str):
    """Return the option's parsed value if the option is defined, else raise."""
    if parser.has_option(opt_str):
        opt = parser.get_option(opt_str)
        return getattr(parser.values, opt.dest)
    raise Exception("cannot find %s in list of possible options" % opt_str)


# Fixed mapping of the GPU device into every process' address space.
gpu_map_vaddr = 0x10000000
gpu_map_paddr = 0x200000000
gpu_map_size = 4096


class GPUProcess(LiveProcess):
    # Every process that wants to access the GPU has to map the device
    # into its address space. That can't happen until after the C++
    # object has been created though, so we have to do it via this
    # callback hook.
    def startup(self):
        self.map(gpu_map_vaddr, gpu_map_paddr, gpu_map_size, False)
        self._ccObject.startup()


# Script options.
parser = optparse.OptionParser()
Ruby.define_options(parser)
Options.addCommonOptions(parser)
Options.addSEOptions(parser)
GPUTLBOptions.tlb_options(parser)

parser.add_option("--num-systems", type="int", default=1,
                  help="Number of copies of the system to create")
parser.add_option("-u", "--num-compute-units", type="int", default=1,
                  help="Number of GPU compute units")
parser.add_option("--num-cp", type="int", default=0,
                  help="Number of GPU Command Processors (CP)")
parser.add_option("-k", "--kernel-files",
                  help="File(s) containing GPU kernels (colon separated)")
parser.add_option("--benchmark-root",
                  help="Root of benchmark directory tree")
parser.add_option("--cpu-only-mode", action="store_true", default=False,
                  help="APU mode. Used to take care of problems in "
                       "Ruby.py while running APU protocols")
parser.add_option("--cu-per-sqc", type="int", default=4,
                  help="Number of CUs sharing an SQC (icache and i-TLB)")
parser.add_option("--simds-per-cu", type="int", default=4,
                  help="SIMD units per CU")
parser.add_option("--wf-size", type="int", default=64,
                  help="Wavefront size(in workitems)")
parser.add_option("--sp-bypass-path-length", type="int", default=4,
                  help="Number of stages of bypass path in vector ALU for "
                       "Single Precision ops")
parser.add_option("--dp-bypass-path-length", type="int", default=4,
                  help="Number of stages of bypass path in vector ALU for "
                       "Double Precision ops")
parser.add_option("--issue-period", type="int", default=4,
                  help="Cycles per vector instruction issue period")
parser.add_option("--glbmem-wr-bus-width", type="int", default=32,
                  help="VGPR to Coalescer (Global Memory) data bus width "
                       "in bytes")
parser.add_option("--glbmem-rd-bus-width", type="int", default=32,
                  help="Coalescer to VGPR (Global Memory) data bus width "
                       "in bytes")
parser.add_option("--shr-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Shared Memory pipelines per CU")
parser.add_option("--glb-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Global Memory pipelines per CU")
parser.add_option("--wfs-per-simd", type="int", default=10,
                  help="Number of WF slots per SIMD")
parser.add_option("--vreg-file-size", type="int", default=2048,
                  help="Number of physical vector registers per SIMD")
parser.add_option("--bw-scalor", type="int", default=0,
                  help="Bandwidth scalor for scalability analysis")
parser.add_option("--CPUClock", type="string", default="2GHz",
                  help="CPU clock")
parser.add_option("--GPUClock", type="string", default="1GHz",
                  help="GPU clock")
parser.add_option("--cpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="CPU voltage domain")
parser.add_option("--gpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="GPU voltage domain")
parser.add_option("--CUExecPolicy", type="string", default="OLDEST-FIRST",
                  help="WF exec policy (OLDEST-FIRST, ROUND-ROBIN)")
parser.add_option("--xact-cas-mode", action="store_true",
                  help="Enable load_compare mode (transactional CAS)")
parser.add_option("--SegFaultDebug", action="store_true",
                  help="Checks for GPU seg fault before TLB access")
parser.add_option("--FunctionalTLB", action="store_true",
                  help="Assumes TLB has no latency")
parser.add_option("--LocalMemBarrier", action="store_true",
                  help="Barrier does not wait for writethrough completion")
parser.add_option("--countPages", action="store_true",
                  help="Count page access and output in per-CU output file")
parser.add_option("--TLB-prefetch", type="int",
                  help="Prefetch depth for TLBs")
parser.add_option("--pf-type", type="string",
                  help="Type of prefetch: PF_CU, PF_WF, PF_PHASE, PF_STRIDE")
parser.add_option("--pf-stride", type="int",
                  help="Set prefetch stride")
parser.add_option("--numLdsBanks", type="int", default=32,
                  help="Number of physical banks per LDS module")
parser.add_option("--ldsBankConflictPenalty", type="int", default=1,
                  help="Number of cycles per LDS bank conflict")


# Parse the arguments and fail out of the configuration if some basic
# constraints are not met. The parser needs to successfully parse every
# argument and the simulation needs workloads to be specified.
(options, args) = parser.parse_args()
if args:
    fatal("some arguments were not parsed by the option parser")
elif not options.cmd:
    fatal("no workload(s) specified")


# GPU cache coherence protocols rely on the backing store to function.
setOption(parser, "--access-backing-store")

# Currently the gpu model requires ruby
if buildEnv['PROTOCOL'] == 'None':
    fatal("GPU model requires ruby")

# Currently the gpu model requires only timing or detailed CPU
if options.cpu_type not in ("timing", "detailed"):
    fatal("GPU model requires timing or detailed CPU")


# Benchmarks (and their kernels) are looked up relative to this path;
# default to the current directory unless --benchmark-root is given.
benchmark_path = ['.']
if options.benchmark_root:
    benchmark_path = [options.benchmark_root]

# Examine options and instantiate GPUProcess class for each workload.
+def get_processes(options): + sys_workloads = options.cmd.split(":") + sys_inputs = options.input.split(":") if options.input else [] + sys_outputs = options.output.split(":") if options.output else [] + sys_errouts = options.errout.split(":") if options.errout else [] + sys_pargs = options.options.split(":") if options.options else [] + + sys_kernel_files = [find_file(benchmark_path, f) + for f in options.kernel_files.split(":")] + + sys_multiprocesses = [] + + for i in xrange(len(sys_workloads)): + workloads = sys_workloads[i].split(";") + inputs = sys_inputs[i].split(";") if sys_inputs else [] + outputs = sys_outputs[i].split(";") if sys_outputs else [] + errouts = sys_errouts[i].split(";") if sys_errouts else [] + process_args = sys_pargs[i].split(";") if sys_pargs else [] + + index = 0 + multiprocesses = [] + + for workload in workloads: + process = GPUProcess() + + process.cwd = os.getcwd() + process.executable = workload + process.cmd = [workload] + if len(process_args) > index: + process.cmd += process_args[index].split() + if len(inputs) > index: + process.input = inputs[index] + if len(outputs) > index: + process.output = outputs[index] + if len(errouts) > index: + process.errout = errouts[index] + multiprocesses.append(process) + index += 1 + + sys_multiprocesses.append(multiprocesses) + + num_threads = [1 for i in xrange(len(sys_workloads))] + + # Assumes that this will run without SMT. Need to work on the options + # framework to come up with a way to support multi-system configurations + # with multiple CPUs and SMT. 
+ return sys_multiprocesses, sys_kernel_files, num_threads + + +multiprocesses, sys_kernel_files, num_threads = get_processes(options) +assert(len(multiprocesses) == options.num_systems) + +(CPUClass, test_mem_mode, FutureClass) = Simulation.setCPUClass(options) +cpu_class_list = [] +for s in xrange(options.num_systems): + cpu_class_list.append(Simulation.setCPUClass(options)[0]) + cpu_class_list[-1].numThreads = num_threads[s] + + +# This file can support multiple compute units +assert(options.num_compute_units >= 1) + +# Currently, the SQC (GPU's icache) is shared by multiple compute +# units (CUs). The protocol works just fine even if SQC is not shared. +# Overriding this option here so that the user need not explicitly set +# this (assuming sharing SQC is the common usage) +n_sqc = int(math.ceil(float(options.num_compute_units) / options.cu_per_sqc)) +options.num_sqc = n_sqc + +sys_list = [] + +for s in xrange(options.num_systems): + shader = Shader(n_wf=options.wfs_per_simd, + clk_domain=SrcClockDomain( + clock=options.GPUClock, + voltage_domain=VoltageDomain( + voltage=options.gpu_voltage))) + # GPU_RfO(Read For Ownership) implements SC/TSO memory model. + # Other GPU protocols implement release consistency at GPU side. + # So, all GPU protocols other than GPU_RfO should make their writes + # visible to the global memory and should read from global memory + # during kernal boundary. The pipeline initiates(or do not initiate) + # the acquire/release operation depending on this impl_kern_boundary_sync + # flag. This flag=true means pipeline initiates a acquire/release + # operation at kernel boundary. 
+ if buildEnv["PROTOCOL"] == "GPU_RfO": + shader.impl_kern_boundary_sync = False + else: + shader.impl_kern_boundary_sync = True + + compute_units = [] + per_lane = True if options.TLB_config == "perLane" else False + for i in xrange(options.num_compute_units): + CU = ComputeUnit(cu_id=i, perLaneTLB=per_lane, + num_SIMDs=options.simds_per_cu, + wfSize=options.wf_size, + spbypass_pipe_length=options.sp_bypass_path_length, + dpbypass_pipe_length=options.dp_bypass_path_length, + coalescer_to_vrf_bus_width=\ + options.glbmem_rd_bus_width, + vrf_to_coalescer_bus_width=\ + options.glbmem_wr_bus_width, + num_global_mem_pipes=options.glb_mem_pipes_per_cu, + num_shared_mem_pipes=options.shr_mem_pipes_per_cu, + n_wf=options.wfs_per_simd, + execPolicy=options.CUExecPolicy, + xactCasMode=options.xact_cas_mode, + debugSegFault=options.SegFaultDebug, + functionalTLB=options.FunctionalTLB, + localMemBarrier=options.LocalMemBarrier, + countPages=options.countPages, + localDataStore=\ + LdsState(banks=options.numLdsBanks, + bankConflictPenalty=\ + options.ldsBankConflictPenalty)) + wavefronts = [] + vrfs = [] + for j in xrange(options.simds_per_cu): + for k in xrange(shader.n_wf): + wavefronts.append(Wavefront(simdId=j, wf_slot_id=k)) + vrfs.append(VectorRegisterFile(simd_id=j, + num_regs_per_simd=options.vreg_file_size)) + CU.wavefronts = wavefronts + CU.vector_register_file = vrfs + if options.TLB_prefetch: + CU.prefetch_depth = options.TLB_prefetch + CU.prefetch_prev_type = options.pf_type + + # Attach the LDS and the CU to the bus (which is actually a bridge). + CU.ldsPort = CU.ldsBus.slave + CU.ldsBus.master = CU.localDataStore.cuPort + compute_units.append(CU) + + # Attach compute units to GPU + shader.CUs = compute_units + + cpu_type = "AtomicSimple" + if options.cpu_type == "detailed": + cpu_type = "DerivO3" + if options.cpu_type == "timing": + cpu_type = "TimingSimple" + + # The GPU requires Ruby to work which implicitly requires a timing CPU. 
+ assert(cpu_type != "AtomicSimple") + + # We only support timing mode for shader and memory + shader.timing = True + mem_mode = "timing" + + # Create the CPUs. + cpu_list = [] + for i in range(options.num_cpus): + exec("cpu = %sCPU(cpu_id=i, \ + clk_domain=SrcClockDomain( \ + clock=options.CPUClock, \ + voltage_domain=VoltageDomain( \ + voltage=options.cpu_voltage)))" + % cpu_type) + cpu_list.append(cpu) + + # Create the command processors. + cp_list = [] + for i in xrange(options.num_cp): + exec("cp = %sCPU(cpu_id=options.num_cpus + i, \ + clk_domain=SrcClockDomain( \ + clock=options.CPUClock, \ + voltage_domain=VoltageDomain( \ + voltage=options.cpu_voltage)))" % cpu_type) + cp_list.append(cp) + + host_cpu = cpu_list[0] + + dispatcher = GpuDispatcher() + + # The list ordering here has a special significance. + entire_list = cpu_list + [shader] + cp_list + [dispatcher] + shader_idx = options.num_cpus + cp_idx = shader_idx + 1 + + # Create the system object. + # Notice the CPU list is explicitly added as a parameter to system. + system = System(cpu=entire_list, + mem_mode=mem_mode, + mem_ranges=[AddrRange(options.mem_size)], + cache_line_size=options.cacheline_size) + + kernel_files = [find_file(benchmark_path, f) + for f in sys_kernel_files[s].split(";")] + cl_driver = ClDriver(filename="hsa", codefile=kernel_files) + + for i in xrange(options.num_cpus): + if i >= len(multiprocesses[s]): + system.cpu[i].workload = multiprocesses[s][0] + else: + # Choosing to clobber whatever was in multiprocesses[s][i] + # because we need to set values for it which we did not know + # when the GPUProcess was previously created. (It does not seem + # that these process entries can be changed in Python after + # they are set.) Also, the executables can be different, but we + # assume that they're all the same to simplify this code. 
+ multiprocesses[s][i] = GPUProcess(executable=\ + multiprocesses[s][i].executable, + cmd=multiprocesses[s][i].cmd, + cwd=multiprocesses[s][i].cwd, + input=multiprocesses[s][i].input, + output=multiprocesses[s][i].output, + errout=multiprocesses[s][i].errout, + drivers=[cl_driver]) + if options.env: + with open(options.env, "r") as f: + multiprocesses[s][i].env = [line.rstrip() for line in f] + system.cpu[i].workload = multiprocesses[s][i] + + for cp in cp_list: + cp.workload = host_cpu.workload + + system.voltage_domain = VoltageDomain(voltage=options.sys_voltage) + system.clk_domain = SrcClockDomain(clock=options.sys_clock, + voltage_domain=system.voltage_domain) + system.cpu_voltage_domain = VoltageDomain() + system.cpu_clk_domain = SrcClockDomain(clock=options.CPUClock, + voltage_domain= + system.cpu_voltage_domain) + + # Configure the TLB hierarchy. + GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx) + + system.piobus = IOXBar(width=32, response_latency=0, + frontend_latency=0, forward_latency=0) + + # Note this implicit setting of CPU pointer, shader pointer and TLB array + # parameters must be after the explicit setting of the system CPU list. 
+ shader.cpu_pointer = host_cpu + dispatcher.pio = system.piobus.master + dispatcher.dma = system.piobus.slave + dispatcher.cpu = host_cpu + dispatcher.shader_pointer = shader + dispatcher.cl_driver = cl_driver + + Ruby.create_system(options, None, system) + + system.ruby.clk_domain = SrcClockDomain(clock=options.ruby_clock, + voltage_domain=system.voltage_domain) + for i in xrange(options.num_cpus): + ruby_port = system.ruby._cpu_ports[i] + + # Create interrupt controller + system.cpu[i].createInterruptController() + + # Connect cache port's to ruby + system.cpu[i].icache_port = ruby_port.slave + system.cpu[i].dcache_port = ruby_port.slave + + ruby_port.mem_master_port = system.piobus.slave + if buildEnv["TARGET_ISA"] == "x86": + system.cpu[i].interrupts[0].pio = system.piobus.master + system.cpu[i].interrupts[0].int_master = system.piobus.slave + system.cpu[i].interrupts[0].int_slave = system.piobus.master + + # Because of the peculiarities of the CP core, you may have one CPU + # but two sequencers and thus two cpu ports created. The GPUs + # shouldn't be hooked up until after the CP. To make this script + # generic, figure out the index as below, but note that this assumes + # that there is one sequencer per compute unit and one sequencer per + # SQC for the math to work out correctly. + gpu_port_idx = len(system.ruby._cpu_ports) \ + - options.num_compute_units \ + - n_sqc \ + - options.num_cp * 2 + for i in xrange(options.num_compute_units): + # The pipeline issues wavefront_size number of uncoalesced requests + # in one GPU issue cycle. Hence wavefront_size mem ports. 
+ for j in xrange(options.wf_size): + system.cpu[shader_idx].CUs[i].memory_port[j] = \ + system.ruby._cpu_ports[gpu_port_idx].slave[j] + gpu_port_idx += 1 + for i in xrange(options.num_compute_units): + if i > 0 and not i % options.cu_per_sqc: + gpu_port_idx += 1 + system.cpu[shader_idx].CUs[i].sqc_port = \ + system.ruby._cpu_ports[gpu_port_idx].slave + gpu_port_idx += 1 + for i in xrange(options.num_cp): + system.cpu[cp_idx].createInterruptController() + system.cpu[cp_idx].dcache_port = \ + system.ruby._cpu_ports[gpu_port_idx+i*2].slave + system.cpu[cp_idx].icache_port = \ + system.ruby._cpu_ports[gpu_port_idx+i*2+1].slave + system.cpu[cp_idx].interrupts.pio = system.piobus.master + system.cpu[cp_idx].interrupts.int_master = system.piobus.slave + system.cpu[cp_idx].interrupts.int_slave = system.piobus.master + cp_idx = cp_idx + 1 + + sys_list.append(system) + +root = Root(full_system=False, system=sys_list) +Simulation.run(options, root, sys_list, FutureClass) diff --git a/configs/example/multi-system-se.py b/configs/example/multi-system-se.py new file mode 100644 --- /dev/null +++ b/configs/example/multi-system-se.py @@ -0,0 +1,203 @@ +# Copyright (c) 2012-2013 ARM Limited +# All rights reserved. +# +# The license below extends only to copyright in the software and shall +# not be construed as granting a license to any other intellectual +# property including but not limited to intellectual property relating +# to a hardware implementation of the functionality of the software +# licensed hereunder. You may use the software subject to the license +# terms below provided that you ensure that this notice is replicated +# unmodified and in its entirety in all distributions of the software, +# modified or unmodified, in source code or in binary form. +# +# Copyright (c) 2006-2008 The Regents of The University of Michigan +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Brandon Potter +# +# Multi-system test script (adapted from configs/example/se.py). 
+ +import optparse +import sys +import os + +import m5 +from m5.defines import buildEnv +from m5.objects import * +from m5.util import addToPath, fatal + +addToPath('../common') +addToPath('../ruby') + +import Options +import Simulation +import CacheConfig +import MemConfig +import Ruby + +def get_processes(options): + sys_workloads = options.cmd.split(':') + sys_inputs = options.input.split(':') if options.input else [] + sys_outputs = options.output.split(':') if options.output else [] + sys_errouts = options.errout.split(':') if options.errout else [] + sys_pargs = options.options.split(':') if options.options else [] + + sys_multiprocesses = [] + for i in xrange(len(sys_workloads)): + workloads = sys_workloads[i].split(';') + inputs = sys_inputs[i].split(';') if sys_inputs else [] + outputs = sys_outputs[i].split(';') if sys_outputs else [] + errouts = sys_errouts[i].split(';') if sys_errouts else [] + pargs = sys_pargs[i].split(';') if sys_pargs else [] + + index = 0 + multiprocesses = [] + for workload in workloads: + process = LiveProcess() + + if options.env: + with open(options.env, 'r') as f: + process.env = [line.rstrip() for line in f] + + process.cwd = os.getcwd() + process.executable = workload + process.cmd = [workload] + if len(pargs) > index: + process.cmd += pargs[index].split() + if len(inputs) > index: + process.input = inputs[index] + if len(outputs) > index: + process.output = outputs[index] + if len(errouts) > index: + process.errout = errouts[index] + multiprocesses.append(process) + index += 1 + + sys_multiprocesses.append(multiprocesses) + + return sys_multiprocesses + + +parser = optparse.OptionParser() +Options.addCommonOptions(parser) +Options.addSEOptions(parser) + +parser.add_option("--num-systems", type="int", default=1, + help="Number of copies of the system to create") + +if '--ruby' in sys.argv: + Ruby.define_options(parser) + +(options, args) = parser.parse_args() + +if args: + fatal("configuration script failed to parse all 
arguments") +elif not options.cmd: + fatal("must specify workload(s)") +# Choosing to ignore the more advanced options for script brevity. +# The point here is to show how to setup a multi-system configuration. +elif options.smt: + fatal("smt disabled") +elif options.fastmem: + fatal("fastmem disabled") +elif options.simpoint_profile: + fatal("simpoint disabled") +elif options.checker: + fatal("checker disabled") + +sys_multiprocesses = get_processes(options) +assert(len(sys_multiprocesses) == options.num_systems) + +(CPUClass, test_mem_mode, FutureClass) = Simulation.setCPUClass(options) + +cpu_class_list = [] +sys_list = [] +for s in xrange(options.num_systems): + cpu_class_list.append(Simulation.setCPUClass(options)[0]) + + system = System(cpu=[cpu_class_list[s](cpu_id=i) \ + for i in xrange(options.num_cpus)], + mem_mode=test_mem_mode, + mem_ranges=[AddrRange(options.mem_size)], + cache_line_size=options.cacheline_size) + + system.voltage_domain = VoltageDomain(voltage=options.sys_voltage) + system.cpu_voltage_domain = VoltageDomain() + system.clk_domain = SrcClockDomain(clock=options.sys_clock, + voltage_domain=system.voltage_domain) + system.cpu_clk_domain = SrcClockDomain(clock=options.cpu_clock, + voltage_domain= + system.cpu_voltage_domain) + for cpu in system.cpu: + cpu.clk_domain = system.cpu_clk_domain + + sys_list.append(system) + + for i in xrange(options.num_cpus): + sys_list[s].cpu[i].createThreads() + # Quick and dirty fix to resolve the issue of being forced to + # assign workloads to every CPU. The CPUs remain inactive until + # thread contexts are manually activated. 
+ sys_list[s].cpu[i].workload = sys_multiprocesses[s][0] + if i < len(sys_multiprocesses[s]): + sys_list[s].cpu[i].workload = sys_multiprocesses[s][i] + + if options.ruby: + if not (options.cpu_type in ("detailed", "timing")): + fatal("Ruby requires CPU with a timing model!") + + Ruby.create_system(options, False, sys_list[s]) + + sys_list[s].ruby.clk_domain = \ + SrcClockDomain(clock=options.ruby_clock, + voltage_domain=sys_list[s].voltage_domain) + + for i in xrange(options.num_cpus): + ruby_port = sys_list[s].ruby._cpu_ports[i] + + # Create the interrupt controller and connect its ports to Ruby + # Note that the interrupt controller is always present but only + # in x86 does it have message ports that need to be connected + sys_list[s].cpu[i].createInterruptController() + + sys_list[s].cpu[i].icache_port = ruby_port.slave + sys_list[s].cpu[i].dcache_port = ruby_port.slave + if buildEnv['TARGET_ISA'] == 'x86': + sys_list[s].cpu[i].interrupts[0].pio = ruby_port.master + sys_list[s].cpu[i].interrupts[0].int_master = ruby_port.slave + sys_list[s].cpu[i].interrupts[0].int_slave = ruby_port.master + sys_list[s].cpu[i].itb.walker.port = ruby_port.slave + sys_list[s].cpu[i].dtb.walker.port = ruby_port.slave + else: + MemClass = Simulation.setMemClass(options) + sys_list[s].membus = SystemXBar() + sys_list[s].system_port = sys_list[s].membus.slave + CacheConfig.config_cache(options, sys_list[s]) + MemConfig.config_mem(options, sys_list[s]) + +root = Root(full_system=False, system=sys_list) +Simulation.run(options, root, sys_list, FutureClass) diff --git a/src/mem/ruby/network/MessageBuffer.hh b/src/mem/ruby/network/MessageBuffer.hh --- a/src/mem/ruby/network/MessageBuffer.hh +++ b/src/mem/ruby/network/MessageBuffer.hh @@ -37,7 +37,6 @@ #include #include #include -#include #include #include @@ -159,6 +158,8 @@ int m_input_link_id; int m_vnet_id; + + RubySystem *m_ruby_system; }; Tick random_time(); diff --git a/src/mem/ruby/network/MessageBuffer.cc 
b/src/mem/ruby/network/MessageBuffer.cc --- a/src/mem/ruby/network/MessageBuffer.cc +++ b/src/mem/ruby/network/MessageBuffer.cc @@ -44,7 +44,7 @@ m_max_size(p->buffer_size), m_time_last_time_size_checked(0), m_time_last_time_enqueue(0), m_time_last_time_pop(0), m_last_arrival_time(0), m_strict_fifo(p->ordered), - m_randomization(p->randomization) + m_randomization(p->randomization), m_ruby_system(p->ruby_system) { m_msg_counter = 0; m_consumer = NULL; @@ -151,7 +151,7 @@ assert(delta > 0); Tick arrival_time = 0; - if (!RubySystem::getRandomization() || !m_randomization) { + if (!m_ruby_system->getRandomization() || !m_randomization) { // No randomization arrival_time = current_time + delta; } else { @@ -178,7 +178,7 @@ } // If running a cache trace, don't worry about the last arrival checks - if (!RubySystem::getWarmupEnabled()) { + if (!m_ruby_system->getWarmupEnabled()) { m_last_arrival_time = arrival_time; } diff --git a/src/mem/ruby/network/MessageBuffer.py b/src/mem/ruby/network/MessageBuffer.py --- a/src/mem/ruby/network/MessageBuffer.py +++ b/src/mem/ruby/network/MessageBuffer.py @@ -38,6 +38,7 @@ buffer_size = Param.Unsigned(0, "Maximum number of entries to buffer \ (0 allows infinite entries)") randomization = Param.Bool(False, "") + ruby_system = Param.RubySystem(Parent.any, "Parent RubySystem object") master = MasterPort("Master port to MessageBuffer receiver") slave = SlavePort("Slave port from MessageBuffer sender") diff --git a/src/mem/ruby/network/Network.py b/src/mem/ruby/network/Network.py --- a/src/mem/ruby/network/Network.py +++ b/src/mem/ruby/network/Network.py @@ -28,6 +28,7 @@ # Brad Beckmann from m5.params import * +from m5.proxy import * from ClockedObject import ClockedObject from BasicLink import BasicLink @@ -47,7 +48,7 @@ block_size_bytes = Param.UInt32(64, "block size used for data messages.") control_msg_size = Param.UInt32(8, "") - ruby_system = Param.RubySystem("") + ruby_system = Param.RubySystem(Parent.any, "Parent RubySystem 
object") routers = VectorParam.BasicRouter("Network routers") netifs = VectorParam.ClockedObject("Network Interfaces") diff --git a/src/mem/ruby/profiler/Profiler.cc b/src/mem/ruby/profiler/Profiler.cc --- a/src/mem/ruby/profiler/Profiler.cc +++ b/src/mem/ruby/profiler/Profiler.cc @@ -354,7 +354,7 @@ for (uint32_t i = 0; i < MachineType_NUM; i++) { for (map::iterator it = m_ruby_system->m_abstract_controls[i].begin(); - it != m_ruby_system->m_abstract_controls[i].end(); ++it) { + it != m_ruby_system->m_abstract_controls[i].end(); ++it) { AbstractController *ctr = (*it).second; Sequencer *seq = ctr->getCPUSequencer(); @@ -371,7 +371,7 @@ for (uint32_t i = 0; i < MachineType_NUM; i++) { for (map::iterator it = m_ruby_system->m_abstract_controls[i].begin(); - it != m_ruby_system->m_abstract_controls[i].end(); ++it) { + it != m_ruby_system->m_abstract_controls[i].end(); ++it) { AbstractController *ctr = (*it).second; Sequencer *seq = ctr->getCPUSequencer(); diff --git a/src/mem/ruby/slicc_interface/AbstractController.hh b/src/mem/ruby/slicc_interface/AbstractController.hh --- a/src/mem/ruby/slicc_interface/AbstractController.hh +++ b/src/mem/ruby/slicc_interface/AbstractController.hh @@ -141,6 +141,7 @@ const NodeID m_version; MachineID m_machineID; const NodeID m_clusterID; + RubySystem *m_ruby_system; // MasterID used by some components of gem5. 
const MasterID m_masterId; diff --git a/src/mem/ruby/slicc_interface/AbstractController.cc b/src/mem/ruby/slicc_interface/AbstractController.cc --- a/src/mem/ruby/slicc_interface/AbstractController.cc +++ b/src/mem/ruby/slicc_interface/AbstractController.cc @@ -38,6 +38,7 @@ AbstractController::AbstractController(const Params *p) : MemObject(p), Consumer(this), m_version(p->version), m_clusterID(p->cluster_id), + m_ruby_system(p->ruby_system), m_masterId(p->system->getMasterId(name())), m_is_blocking(false), m_block_size_bytes(p->block_size_bytes), m_block_size_bits(floorLog2(m_block_size_bytes)), @@ -47,13 +48,12 @@ memoryPort(csprintf("%s.memory", name()), this, "") { assert(isPowerOf2(m_block_size_bytes)); - } void AbstractController::init() { - params()->ruby_system->registerAbstractController(this); + m_ruby_system->registerAbstractController(this); m_delayHistogram.init(10); uint32_t size = Network::getNumberOfVirtualNetworks(); for (uint32_t i = 0; i < size; i++) { @@ -226,7 +226,7 @@ pkt->pushSenderState(s); // Use functional rather than timing accesses during warmup - if (RubySystem::getWarmupEnabled()) { + if (m_ruby_system->getWarmupEnabled()) { memoryPort.sendFunctional(pkt); recvTimingResp(pkt); return; @@ -250,7 +250,7 @@ pkt->pushSenderState(s); // Use functional rather than timing accesses during warmup - if (RubySystem::getWarmupEnabled()) { + if (m_ruby_system->getWarmupEnabled()) { memoryPort.sendFunctional(pkt); recvTimingResp(pkt); return; diff --git a/src/mem/ruby/slicc_interface/Controller.py b/src/mem/ruby/slicc_interface/Controller.py --- a/src/mem/ruby/slicc_interface/Controller.py +++ b/src/mem/ruby/slicc_interface/Controller.py @@ -47,7 +47,7 @@ recycle_latency = Param.Cycles(10, "") number_of_TBEs = Param.Int(256, "") - ruby_system = Param.RubySystem("") + ruby_system = Param.RubySystem(Parent.any, "Parent RubySystem object") memory = MasterPort("Port for attaching a memory controller") system = Param.System(Parent.any, "system object 
parameter") diff --git a/src/mem/ruby/structures/TimerTable.cc b/src/mem/ruby/structures/TimerTable.cc --- a/src/mem/ruby/structures/TimerTable.cc +++ b/src/mem/ruby/structures/TimerTable.cc @@ -33,7 +33,7 @@ { m_consumer_ptr = NULL; m_next_valid = false; - m_next_address = 0; + m_next_address = Addr(0); } bool diff --git a/src/mem/ruby/system/RubySystem.hh b/src/mem/ruby/system/RubySystem.hh --- a/src/mem/ruby/system/RubySystem.hh +++ b/src/mem/ruby/system/RubySystem.hh @@ -71,10 +71,10 @@ ~RubySystem(); // config accessors - static int getRandomization() { return m_randomization; } - static uint32_t getMemorySizeBits() { return m_memory_size_bits; } - static bool getWarmupEnabled() { return m_warmup_enabled; } - static bool getCooldownEnabled() { return m_cooldown_enabled; } + int getRandomization() { return m_randomization; } + uint32_t getMemorySizeBits() { return m_memory_size_bits; } + bool getWarmupEnabled() { return m_warmup_enabled; } + bool getCooldownEnabled() { return m_cooldown_enabled; } SimpleMemory *getPhysMem() { return m_phys_mem; } Cycles getStartCycle() { return m_start_cycle; } @@ -128,14 +128,14 @@ private: // configuration parameters - static bool m_randomization; - const uint32_t m_block_size_bytes; - const uint32_t m_block_size_bits; - static uint32_t m_memory_size_bits; + bool m_randomization; + uint32_t m_block_size_bytes; + uint32_t m_block_size_bits; + uint32_t m_memory_size_bits; - static bool m_warmup_enabled; + bool m_warmup_enabled; + bool m_cooldown_enabled; static unsigned m_systems_to_warmup; - static bool m_cooldown_enabled; SimpleMemory *m_phys_mem; const bool m_access_backing_store; diff --git a/src/mem/ruby/system/RubySystem.cc b/src/mem/ruby/system/RubySystem.cc --- a/src/mem/ruby/system/RubySystem.cc +++ b/src/mem/ruby/system/RubySystem.cc @@ -46,13 +46,7 @@ using namespace std; -bool RubySystem::m_randomization; -uint32_t RubySystem::m_memory_size_bits; -bool RubySystem::m_warmup_enabled = false; -// To look forward to 
allowing multiple RubySystem instances, track the number -// of RubySystems that need to be warmed up on checkpoint restore. unsigned RubySystem::m_systems_to_warmup = 0; -bool RubySystem::m_cooldown_enabled = false; RubySystem::RubySystem(const Params *p) : ClockedObject(p), m_block_size_bytes(p->block_size_bytes), @@ -62,6 +56,9 @@ { m_randomization = p->randomization; + m_warmup_enabled = false; + m_cooldown_enabled = false; + assert(isPowerOf2(m_block_size_bytes)); m_memory_size_bits = p->memory_size_bits; @@ -249,7 +246,7 @@ // Aggregate the trace entries together into a single array uint8_t *raw_data = new uint8_t[4096]; uint64_t cache_trace_size = m_cache_recorder->aggregateRecords(&raw_data, - 4096); + 4096); string cache_trace_file = name() + ".cache.gz"; writeCompressedTrace(raw_data, cache_trace_file, cache_trace_size); @@ -388,10 +385,10 @@ void RubySystem::RubyEvent::process() { - if (RubySystem::getWarmupEnabled()) { + if (m_ruby_system->getWarmupEnabled()) { m_ruby_system->m_cache_recorder->enqueueNextFetchRequest( m_ruby_system->m_block_size_bytes); - } else if (RubySystem::getCooldownEnabled()) { + } else if (m_ruby_system->getCooldownEnabled()) { m_ruby_system->m_cache_recorder->enqueueNextFlushRequest(); } } diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh --- a/src/mem/ruby/system/Sequencer.hh +++ b/src/mem/ruby/system/Sequencer.hh @@ -148,6 +148,9 @@ Stats::Counter getIncompleteTimes(const MachineType t) const { return m_IncompleteTimes[t]; } + protected: + RubySystem *m_ruby_system; + private: void issueRequest(PacketPtr pkt, RubyRequestType type); diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc --- a/src/mem/ruby/system/Sequencer.cc +++ b/src/mem/ruby/system/Sequencer.cc @@ -55,6 +55,7 @@ : RubyPort(p), m_IncompleteTimes(MachineType_NUM), deadlockCheckEvent(this) { + m_ruby_system = p->ruby_system; m_outstanding_count = 0; m_instCache_ptr = p->icache; @@ -452,7 +453,7 @@ 
total_latency); // update the data unless it is a non-data-carrying flush - if (RubySystem::getWarmupEnabled()) { + if (m_ruby_system->getWarmupEnabled()) { data.setData(pkt->getConstPtr<uint8_t>(), offset_address, pkt->getSize()); } else if (!pkt->isFlush()) { @@ -487,15 +488,15 @@ delete srequest; - RubySystem *rs = m_ruby_system; - if (RubySystem::getWarmupEnabled()) { + if (m_ruby_system->getWarmupEnabled()) { assert(pkt->req); delete pkt->req; delete pkt; - rs->m_cache_recorder->enqueueNextFetchRequest(m_block_size_bytes); - } else if (RubySystem::getCooldownEnabled()) { + m_ruby_system->m_cache_recorder->\ + enqueueNextFetchRequest(m_block_size_bytes); + } else if (m_ruby_system->getCooldownEnabled()) { delete pkt; - rs->m_cache_recorder->enqueueNextFlushRequest(); + m_ruby_system->m_cache_recorder->enqueueNextFlushRequest(); } else { ruby_hit_callback(pkt); testDrainComplete(); diff --git a/src/mem/ruby/system/Sequencer.py b/src/mem/ruby/system/Sequencer.py --- a/src/mem/ruby/system/Sequencer.py +++ b/src/mem/ruby/system/Sequencer.py @@ -46,7 +46,7 @@ using_ruby_tester = Param.Bool(False, "") no_retry_on_stall = Param.Bool(False, "") - ruby_system = Param.RubySystem(Parent.any, "") + ruby_system = Param.RubySystem(Parent.any, "Parent RubySystem object") system = Param.System(Parent.any, "system object") block_size_bytes = Param.UInt32(64, "cache block size") support_data_reqs = Param.Bool(True, "data cache requests supported")