# Node ID 69918440479476a3be5064c74fca61ea35cea842 # Parent 32d1739cbfd76a7dd420ee4039ffeb23d7dcade8 diff --git a/configs/example/multi-system-apu.py b/configs/example/multi-system-apu.py new file mode 100644 --- /dev/null +++ b/configs/example/multi-system-apu.py @@ -0,0 +1,519 @@ +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#
# Authors: Sooraj Puthoor
#          Brandon Potter
#

import glob
import math
import optparse
import os
import re

import m5
# buildEnv is consulted below for the PROTOCOL and TARGET_ISA checks; it
# was missing from the original import list (NameError at first use).
from m5.defines import buildEnv
from m5.objects import *
from m5.util import addToPath, fatal

addToPath("../ruby")
addToPath("../common")
addToPath("../topologies")

import Options
import Ruby
import Simulation
import GPUTLBOptions
import GPUTLBConfig


def find_path(base_list, rel_path, test):
    """Search for rel_path under each directory in base_list.

    Returns the first joined path accepted by the `test` predicate
    (e.g. os.path.isfile). If no base matches, fail out of the
    configuration entirely.
    """
    for base in base_list:
        # An empty/None entry contributes nothing to the search path.
        if not base:
            continue
        full_path = os.path.join(base, rel_path)
        if test(full_path):
            return full_path
    fatal("%s not found in %s" % (rel_path, base_list))


def find_file(base_list, rel_path):
    """Special instance of find_path that only accepts regular files."""
    return find_path(base_list, rel_path, os.path.isfile)


def setOption(parser, opt_str, value=1):
    """Set the option's value if the option is defined, else raise.

    Uses setattr rather than exec-with-%s interpolation so that string
    (or arbitrary) values are assigned safely.
    """
    if parser.has_option(opt_str):
        opt = parser.get_option(opt_str)
        setattr(parser.values, opt.dest, value)
        return
    raise Exception("cannot find %s in list of possible options" % opt_str)


def getOption(parser, opt_str):
    """Return the option's parsed value if the option is defined, else raise."""
    if parser.has_option(opt_str):
        opt = parser.get_option(opt_str)
        return getattr(parser.values, opt.dest)
    raise Exception("cannot find %s in list of possible options" % opt_str)


# Fixed mapping of the GPU device into every process' address space.
gpu_map_vaddr = 0x10000000
gpu_map_paddr = 0x200000000
gpu_map_size = 4096


class GPUProcess(LiveProcess):
    # Every process that wants to access the GPU has to map the device
    # into its address space. That can't happen until after the C++
    # object has been created though, so we have to do it via this
    # callback hook.
    def startup(self):
        self.map(gpu_map_vaddr, gpu_map_paddr, gpu_map_size, False)
        self._ccObject.startup()


# Script options.
parser = optparse.OptionParser()
Ruby.define_options(parser)
Options.addCommonOptions(parser)
Options.addSEOptions(parser)
GPUTLBOptions.tlb_options(parser)

parser.add_option("--num-systems", type="int", default=1,
                  help="Number of copies of the system to create")
parser.add_option("-u", "--num-compute-units", type="int", default=1,
                  help="Number of GPU compute units")
parser.add_option("--num-cp", type="int", default=0,
                  help="Number of GPU Command Processors (CP)")
parser.add_option("-k", "--kernel-files",
                  help="File(s) containing GPU kernels (colon separated)")
parser.add_option("--benchmark-root",
                  help="Root of benchmark directory tree")
parser.add_option("--cpu-only-mode", action="store_true", default=False,
                  help="APU mode. Used to take care of problems in "
                       "Ruby.py while running APU protocols")
parser.add_option("--cu-per-sqc", type="int", default=4,
                  help="Number of CUs sharing an SQC (icache and i-TLB)")
parser.add_option("--simds-per-cu", type="int", default=4,
                  help="SIMD units per CU")
parser.add_option("--wf-size", type="int", default=64,
                  help="Wavefront size(in workitems)")
parser.add_option("--sp-bypass-path-length", type="int", default=4,
                  help="Number of stages of bypass path in vector ALU for "
                       "Single Precision ops")
parser.add_option("--dp-bypass-path-length", type="int", default=4,
                  help="Number of stages of bypass path in vector ALU for "
                       "Double Precision ops")
parser.add_option("--issue-period", type="int", default=4,
                  help="Cycles per vector instruction issue period")
parser.add_option("--glbmem-wr-bus-width", type="int", default=32,
                  help="VGPR to Coalescer (Global Memory) data bus width "
                       "in bytes")
parser.add_option("--glbmem-rd-bus-width", type="int", default=32,
                  help="Coalescer to VGPR (Global Memory) data bus width "
                       "in bytes")
parser.add_option("--shr-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Shared Memory pipelines per CU")
parser.add_option("--glb-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Global Memory pipelines per CU")
parser.add_option("--wfs-per-simd", type="int", default=10,
                  help="Number of WF slots per SIMD")
parser.add_option("--vreg-file-size", type="int", default=2048,
                  help="Number of physical vector registers per SIMD")
parser.add_option("--bw-scalor", type="int", default=0,
                  help="Bandwidth scalor for scalability analysis")
parser.add_option("--CPUClock", type="string", default="2GHz",
                  help="CPU clock")
parser.add_option("--GPUClock", type="string", default="1GHz",
                  help="GPU clock")
parser.add_option("--cpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="CPU voltage domain")
parser.add_option("--gpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="GPU voltage domain")
parser.add_option("--CUExecPolicy", type="string", default="OLDEST-FIRST",
                  help="WF exec policy (OLDEST-FIRST, ROUND-ROBIN)")
parser.add_option("--xact-cas-mode", action="store_true",
                  help="Enable load_compare mode (transactional CAS)")
parser.add_option("--SegFaultDebug", action="store_true",
                  help="Checks for GPU seg fault before TLB access")
parser.add_option("--FunctionalTLB", action="store_true",
                  help="Assumes TLB has no latency")
parser.add_option("--LocalMemBarrier", action="store_true",
                  help="Barrier does not wait for writethrough completion")
parser.add_option("--countPages", action="store_true",
                  help="Count page access and output in per-CU output file")
parser.add_option("--TLB-prefetch", type="int",
                  help="Prefetch depth for TLBs")
parser.add_option("--pf-type", type="string",
                  help="Type of prefetch: PF_CU, PF_WF, PF_PHASE, PF_STRIDE")
parser.add_option("--pf-stride", type="int",
                  help="Set prefetch stride")
parser.add_option("--numLdsBanks", type="int", default=32,
                  help="Number of physical banks per LDS module")
parser.add_option("--ldsBankConflictPenalty", type="int", default=1,
                  help="Number of cycles per LDS bank conflict")


# Parse the arguments and fail out of the configuration if some basic
# constraints are not met. The parser needs to successfully parse every
# argument and the simulation needs workloads to be specified.
(options, args) = parser.parse_args()
if args:
    fatal("some arguments were not parsed by the option parser")
elif not options.cmd:
    fatal("no workload(s) specified")


# GPU cache coherence protocols rely on the backing store to function.
setOption(parser, "--access-backing-store")

# Currently the gpu model requires ruby
if buildEnv['PROTOCOL'] == 'None':
    fatal("GPU model requires ruby")

# Currently the gpu model requires only timing or detailed CPU
if options.cpu_type not in ("timing", "detailed"):
    fatal("GPU model requires timing or detailed CPU")


# Benchmarks (and their kernels) are looked up relative to this path;
# default to the current directory unless --benchmark-root is given.
benchmark_path = ['.']
if options.benchmark_root:
    benchmark_path = [options.benchmark_root]

# Examine options and instantiate GPUProcess class for each workload.
+def get_processes(options): + sys_workloads = options.cmd.split(":") + sys_inputs = options.input.split(":") if options.input else [] + sys_outputs = options.output.split(":") if options.output else [] + sys_errouts = options.errout.split(":") if options.errout else [] + sys_pargs = options.options.split(":") if options.options else [] + + sys_kernel_files = [find_file(benchmark_path, f) + for f in options.kernel_files.split(":")] + + sys_multiprocesses = [] + + for i in xrange(len(sys_workloads)): + workloads = sys_workloads[i].split(";") + inputs = sys_inputs[i].split(";") if sys_inputs else [] + outputs = sys_outputs[i].split(";") if sys_outputs else [] + errouts = sys_errouts[i].split(";") if sys_errouts else [] + process_args = sys_pargs[i].split(";") if sys_pargs else [] + + index = 0 + multiprocesses = [] + + for workload in workloads: + process = GPUProcess() + + process.cwd = os.getcwd() + process.executable = workload + process.cmd = [workload] + if len(process_args) > index: + process.cmd += process_args[index].split() + if len(inputs) > index: + process.input = inputs[index] + if len(outputs) > index: + process.output = outputs[index] + if len(errouts) > index: + process.errout = errouts[index] + multiprocesses.append(process) + index += 1 + + sys_multiprocesses.append(multiprocesses) + + num_threads = [1 for i in xrange(len(sys_workloads))] + + # Assumes that this will run without SMT. Need to work on the options + # framework to come up with a way to support multi-system configurations + # with multiple CPUs and SMT. 
+ return sys_multiprocesses, sys_kernel_files, num_threads + + +multiprocesses, sys_kernel_files, num_threads = get_processes(options) +assert(len(multiprocesses) == options.num_systems) + +(CPUClass, test_mem_mode, FutureClass) = Simulation.setCPUClass(options) +cpu_class_list = [] +for s in xrange(options.num_systems): + cpu_class_list.append(Simulation.setCPUClass(options)[0]) + cpu_class_list[-1].numThreads = num_threads[s] + + +# This file can support multiple compute units +assert(options.num_compute_units >= 1) + +# Currently, the SQC (GPU's icache) is shared by multiple compute +# units (CUs). The protocol works just fine even if SQC is not shared. +# Overriding this option here so that the user need not explicitly set +# this (assuming sharing SQC is the common usage) +n_sqc = int(math.ceil(float(options.num_compute_units) / options.cu_per_sqc)) +options.num_sqc = n_sqc + +sys_list = [] + +for s in xrange(options.num_systems): + shader = Shader(n_wf=options.wfs_per_simd, + clk_domain=SrcClockDomain( + clock=options.GPUClock, + voltage_domain=VoltageDomain( + voltage=options.gpu_voltage))) + # GPU_RfO(Read For Ownership) implements SC/TSO memory model. + # Other GPU protocols implement release consistency at GPU side. + # So, all GPU protocols other than GPU_RfO should make their writes + # visible to the global memory and should read from global memory + # during kernal boundary. The pipeline initiates(or do not initiate) + # the acquire/release operation depending on this impl_kern_boundary_sync + # flag. This flag=true means pipeline initiates a acquire/release + # operation at kernel boundary. 
+ if buildEnv["PROTOCOL"] == "GPU_RfO": + shader.impl_kern_boundary_sync = False + else: + shader.impl_kern_boundary_sync = True + + compute_units = [] + per_lane = True if options.TLB_config == "perLane" else False + for i in xrange(options.num_compute_units): + CU = ComputeUnit(cu_id=i, perLaneTLB=per_lane, + num_SIMDs=options.simds_per_cu, + wfSize=options.wf_size, + spbypass_pipe_length=options.sp_bypass_path_length, + dpbypass_pipe_length=options.dp_bypass_path_length, + coalescer_to_vrf_bus_width=\ + options.glbmem_rd_bus_width, + vrf_to_coalescer_bus_width=\ + options.glbmem_wr_bus_width, + num_global_mem_pipes=options.glb_mem_pipes_per_cu, + num_shared_mem_pipes=options.shr_mem_pipes_per_cu, + n_wf=options.wfs_per_simd, + execPolicy=options.CUExecPolicy, + xactCasMode=options.xact_cas_mode, + debugSegFault=options.SegFaultDebug, + functionalTLB=options.FunctionalTLB, + localMemBarrier=options.LocalMemBarrier, + countPages=options.countPages, + localDataStore=\ + LdsState(banks=options.numLdsBanks, + bankConflictPenalty=\ + options.ldsBankConflictPenalty)) + wavefronts = [] + vrfs = [] + for j in xrange(options.simds_per_cu): + for k in xrange(shader.n_wf): + wavefronts.append(Wavefront(simdId=j, wf_slot_id=k)) + vrfs.append(VectorRegisterFile(simd_id=j, + num_regs_per_simd=options.vreg_file_size)) + CU.wavefronts = wavefronts + CU.vector_register_file = vrfs + if options.TLB_prefetch: + CU.prefetch_depth = options.TLB_prefetch + CU.prefetch_prev_type = options.pf_type + + # Attach the LDS and the CU to the bus (which is actually a bridge). + CU.ldsPort = CU.ldsBus.slave + CU.ldsBus.master = CU.localDataStore.cuPort + compute_units.append(CU) + + # Attach compute units to GPU + shader.CUs = compute_units + + cpu_type = "AtomicSimple" + if options.cpu_type == "detailed": + cpu_type = "DerivO3" + if options.cpu_type == "timing": + cpu_type = "TimingSimple" + + # The GPU requires Ruby to work which implicitly requires a timing CPU. 
+ assert(cpu_type != "AtomicSimple") + + # We only support timing mode for shader and memory + shader.timing = True + mem_mode = "timing" + + # Create the CPUs. + cpu_list = [] + for i in range(options.num_cpus): + exec("cpu = %sCPU(cpu_id=i, \ + clk_domain=SrcClockDomain( \ + clock=options.CPUClock, \ + voltage_domain=VoltageDomain( \ + voltage=options.cpu_voltage)))" + % cpu_type) + cpu_list.append(cpu) + + # Create the command processors. + cp_list = [] + for i in xrange(options.num_cp): + exec("cp = %sCPU(cpu_id=options.num_cpus + i, \ + clk_domain=SrcClockDomain( \ + clock=options.CPUClock, \ + voltage_domain=VoltageDomain( \ + voltage=options.cpu_voltage)))" % cpu_type) + cp_list.append(cp) + + host_cpu = cpu_list[0] + + dispatcher = GpuDispatcher() + + # The list ordering here has a special significance. + entire_list = cpu_list + [shader] + cp_list + [dispatcher] + shader_idx = options.num_cpus + cp_idx = shader_idx + 1 + + # Create the system object. + # Notice the CPU list is explicitly added as a parameter to system. + system = System(cpu=entire_list, + mem_mode=mem_mode, + mem_ranges=[AddrRange(options.mem_size)], + cache_line_size=options.cacheline_size) + + kernel_files = [find_file(benchmark_path, f) + for f in sys_kernel_files[s].split(";")] + cl_driver = ClDriver(filename="hsa", codefile=kernel_files) + + for i in xrange(options.num_cpus): + if i >= len(multiprocesses[s]): + system.cpu[i].workload = multiprocesses[s][0] + else: + # Choosing to clobber whatever was in multiprocesses[s][i] + # because we need to set values for it which we did not know + # when the GPUProcess was previously created. (It does not seem + # that these process entries can be changed in Python after + # they are set.) Also, the executables can be different, but we + # assume that they're all the same to simplify this code. 
+ multiprocesses[s][i] = GPUProcess(executable=\ + multiprocesses[s][i].executable, + cmd=multiprocesses[s][i].cmd, + cwd=multiprocesses[s][i].cwd, + input=multiprocesses[s][i].input, + output=multiprocesses[s][i].output, + errout=multiprocesses[s][i].errout, + drivers=[cl_driver]) + if options.env: + with open(options.env, "r") as f: + multiprocesses[s][i].env = [line.rstrip() for line in f] + system.cpu[i].workload = multiprocesses[s][i] + + for cp in cp_list: + cp.workload = host_cpu.workload + + system.voltage_domain = VoltageDomain(voltage=options.sys_voltage) + system.clk_domain = SrcClockDomain(clock=options.sys_clock, + voltage_domain=system.voltage_domain) + system.cpu_voltage_domain = VoltageDomain() + system.cpu_clk_domain = SrcClockDomain(clock=options.CPUClock, + voltage_domain= + system.cpu_voltage_domain) + + # Configure the TLB hierarchy. + GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx) + + system.piobus = IOXBar(width=32, response_latency=0, + frontend_latency=0, forward_latency=0) + + # Note this implicit setting of CPU pointer, shader pointer and TLB array + # parameters must be after the explicit setting of the system CPU list. 
+ shader.cpu_pointer = host_cpu + dispatcher.pio = system.piobus.master + dispatcher.dma = system.piobus.slave + dispatcher.cpu = host_cpu + dispatcher.shader_pointer = shader + dispatcher.cl_driver = cl_driver + + Ruby.create_system(options, None, system) + + system.ruby.clk_domain = SrcClockDomain(clock=options.ruby_clock, + voltage_domain=system.voltage_domain) + for i in xrange(options.num_cpus): + ruby_port = system.ruby._cpu_ports[i] + + # Create interrupt controller + system.cpu[i].createInterruptController() + + # Connect cache port's to ruby + system.cpu[i].icache_port = ruby_port.slave + system.cpu[i].dcache_port = ruby_port.slave + + ruby_port.mem_master_port = system.piobus.slave + if buildEnv["TARGET_ISA"] == "x86": + system.cpu[i].interrupts[0].pio = system.piobus.master + system.cpu[i].interrupts[0].int_master = system.piobus.slave + system.cpu[i].interrupts[0].int_slave = system.piobus.master + + # Because of the peculiarities of the CP core, you may have one CPU + # but two sequencers and thus two cpu ports created. The GPUs + # shouldn't be hooked up until after the CP. To make this script + # generic, figure out the index as below, but note that this assumes + # that there is one sequencer per compute unit and one sequencer per + # SQC for the math to work out correctly. + gpu_port_idx = len(system.ruby._cpu_ports) \ + - options.num_compute_units \ + - n_sqc \ + - options.num_cp * 2 + for i in xrange(options.num_compute_units): + # The pipeline issues wavefront_size number of uncoalesced requests + # in one GPU issue cycle. Hence wavefront_size mem ports. 
+ for j in xrange(options.wf_size): + system.cpu[shader_idx].CUs[i].memory_port[j] = \ + system.ruby._cpu_ports[gpu_port_idx].slave[j] + gpu_port_idx += 1 + for i in xrange(options.num_compute_units): + if i > 0 and not i % options.cu_per_sqc: + gpu_port_idx += 1 + system.cpu[shader_idx].CUs[i].sqc_port = \ + system.ruby._cpu_ports[gpu_port_idx].slave + gpu_port_idx += 1 + for i in xrange(options.num_cp): + system.cpu[cp_idx].createInterruptController() + system.cpu[cp_idx].dcache_port = \ + system.ruby._cpu_ports[gpu_port_idx+i*2].slave + system.cpu[cp_idx].icache_port = \ + system.ruby._cpu_ports[gpu_port_idx+i*2+1].slave + system.cpu[cp_idx].interrupts.pio = system.piobus.master + system.cpu[cp_idx].interrupts.int_master = system.piobus.slave + system.cpu[cp_idx].interrupts.int_slave = system.piobus.master + cp_idx = cp_idx + 1 + + sys_list.append(system) + +root = Root(full_system=False, system=sys_list) +Simulation.run(options, root, sys_list, FutureClass) diff --git a/configs/example/multi-system-se.py b/configs/example/multi-system-se.py new file mode 100644 --- /dev/null +++ b/configs/example/multi-system-se.py @@ -0,0 +1,203 @@ +# Copyright (c) 2012-2013 ARM Limited +# All rights reserved. +# +# The license below extends only to copyright in the software and shall +# not be construed as granting a license to any other intellectual +# property including but not limited to intellectual property relating +# to a hardware implementation of the functionality of the software +# licensed hereunder. You may use the software subject to the license +# terms below provided that you ensure that this notice is replicated +# unmodified and in its entirety in all distributions of the software, +# modified or unmodified, in source code or in binary form. +# +# Copyright (c) 2006-2008 The Regents of The University of Michigan +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Brandon Potter +# +# Multi-system test script (adapted from configs/example/se.py). 
+ +import optparse +import sys +import os + +import m5 +from m5.defines import buildEnv +from m5.objects import * +from m5.util import addToPath, fatal + +addToPath('../common') +addToPath('../ruby') + +import Options +import Simulation +import CacheConfig +import MemConfig +import Ruby + +def get_processes(options): + sys_workloads = options.cmd.split(':') + sys_inputs = options.input.split(':') if options.input else [] + sys_outputs = options.output.split(':') if options.output else [] + sys_errouts = options.errout.split(':') if options.errout else [] + sys_pargs = options.options.split(':') if options.options else [] + + sys_multiprocesses = [] + for i in xrange(len(sys_workloads)): + workloads = sys_workloads[i].split(';') + inputs = sys_inputs[i].split(';') if sys_inputs else [] + outputs = sys_outputs[i].split(';') if sys_outputs else [] + errouts = sys_errouts[i].split(';') if sys_errouts else [] + pargs = sys_pargs[i].split(';') if sys_pargs else [] + + index = 0 + multiprocesses = [] + for workload in workloads: + process = LiveProcess() + + if options.env: + with open(options.env, 'r') as f: + process.env = [line.rstrip() for line in f] + + process.cwd = os.getcwd() + process.executable = workload + process.cmd = [workload] + if len(pargs) > index: + process.cmd += pargs[index].split() + if len(inputs) > index: + process.input = inputs[index] + if len(outputs) > index: + process.output = outputs[index] + if len(errouts) > index: + process.errout = errouts[index] + multiprocesses.append(process) + index += 1 + + sys_multiprocesses.append(multiprocesses) + + return sys_multiprocesses + + +parser = optparse.OptionParser() +Options.addCommonOptions(parser) +Options.addSEOptions(parser) + +parser.add_option("--num-systems", type="int", default=1, + help="Number of copies of the system to create") + +if '--ruby' in sys.argv: + Ruby.define_options(parser) + +(options, args) = parser.parse_args() + +if args: + fatal("configuration script failed to parse all 
arguments") +elif not options.cmd: + fatal("must specify workload(s)") +# Choosing to ignore the more advanced options for script brevity. +# The point here is to show how to setup a multi-system configuration. +elif options.smt: + fatal("smt disabled") +elif options.fastmem: + fatal("fastmem disabled") +elif options.simpoint_profile: + fatal("simpoint disabled") +elif options.checker: + fatal("checker disabled") + +sys_multiprocesses = get_processes(options) +assert(len(sys_multiprocesses) == options.num_systems) + +(CPUClass, test_mem_mode, FutureClass) = Simulation.setCPUClass(options) + +cpu_class_list = [] +sys_list = [] +for s in xrange(options.num_systems): + cpu_class_list.append(Simulation.setCPUClass(options)[0]) + + system = System(cpu=[cpu_class_list[s](cpu_id=i) \ + for i in xrange(options.num_cpus)], + mem_mode=test_mem_mode, + mem_ranges=[AddrRange(options.mem_size)], + cache_line_size=options.cacheline_size) + + system.voltage_domain = VoltageDomain(voltage=options.sys_voltage) + system.cpu_voltage_domain = VoltageDomain() + system.clk_domain = SrcClockDomain(clock=options.sys_clock, + voltage_domain=system.voltage_domain) + system.cpu_clk_domain = SrcClockDomain(clock=options.cpu_clock, + voltage_domain= + system.cpu_voltage_domain) + for cpu in system.cpu: + cpu.clk_domain = system.cpu_clk_domain + + sys_list.append(system) + + for i in xrange(options.num_cpus): + sys_list[s].cpu[i].createThreads() + # Quick and dirty fix to resolve the issue of being forced to + # assign workloads to every CPU. The CPUs remain inactive until + # thread contexts are manually activated. 
+ sys_list[s].cpu[i].workload = sys_multiprocesses[s][0] + if i < len(sys_multiprocesses[s]): + sys_list[s].cpu[i].workload = sys_multiprocesses[s][i] + + if options.ruby: + if not (options.cpu_type in ("detailed", "timing")): + fatal("Ruby requires CPU with a timing model!") + + Ruby.create_system(options, False, sys_list[s]) + + sys_list[s].ruby.clk_domain = \ + SrcClockDomain(clock=options.ruby_clock, + voltage_domain=sys_list[s].voltage_domain) + + for i in xrange(options.num_cpus): + ruby_port = sys_list[s].ruby._cpu_ports[i] + + # Create the interrupt controller and connect its ports to Ruby + # Note that the interrupt controller is always present but only + # in x86 does it have message ports that need to be connected + sys_list[s].cpu[i].createInterruptController() + + sys_list[s].cpu[i].icache_port = ruby_port.slave + sys_list[s].cpu[i].dcache_port = ruby_port.slave + if buildEnv['TARGET_ISA'] == 'x86': + sys_list[s].cpu[i].interrupts[0].pio = ruby_port.master + sys_list[s].cpu[i].interrupts[0].int_master = ruby_port.slave + sys_list[s].cpu[i].interrupts[0].int_slave = ruby_port.master + sys_list[s].cpu[i].itb.walker.port = ruby_port.slave + sys_list[s].cpu[i].dtb.walker.port = ruby_port.slave + else: + MemClass = Simulation.setMemClass(options) + sys_list[s].membus = SystemXBar() + sys_list[s].system_port = sys_list[s].membus.slave + CacheConfig.config_cache(options, sys_list[s]) + MemConfig.config_mem(options, sys_list[s]) + +root = Root(full_system=False, system=sys_list) +Simulation.run(options, root, sys_list, FutureClass) diff --git a/src/mem/ruby/network/MessageBuffer.hh b/src/mem/ruby/network/MessageBuffer.hh --- a/src/mem/ruby/network/MessageBuffer.hh +++ b/src/mem/ruby/network/MessageBuffer.hh @@ -37,7 +37,6 @@ #include #include #include -#include #include #include @@ -159,6 +158,8 @@ int m_input_link_id; int m_vnet_id; + + RubySystem *m_ruby_system; }; Tick random_time(); diff --git a/src/mem/ruby/network/MessageBuffer.cc 
b/src/mem/ruby/network/MessageBuffer.cc --- a/src/mem/ruby/network/MessageBuffer.cc +++ b/src/mem/ruby/network/MessageBuffer.cc @@ -44,7 +44,7 @@ m_max_size(p->buffer_size), m_time_last_time_size_checked(0), m_time_last_time_enqueue(0), m_time_last_time_pop(0), m_last_arrival_time(0), m_strict_fifo(p->ordered), - m_randomization(p->randomization) + m_randomization(p->randomization), m_ruby_system(p->ruby_system) { m_msg_counter = 0; m_consumer = NULL; @@ -151,7 +151,7 @@ assert(delta > 0); Tick arrival_time = 0; - if (!RubySystem::getRandomization() || !m_randomization) { + if (!m_ruby_system->getRandomization() || !m_randomization) { // No randomization arrival_time = current_time + delta; } else { @@ -178,7 +178,7 @@ } // If running a cache trace, don't worry about the last arrival checks - if (!RubySystem::getWarmupEnabled()) { + if (!m_ruby_system->getWarmupEnabled()) { m_last_arrival_time = arrival_time; } diff --git a/src/mem/ruby/network/MessageBuffer.py b/src/mem/ruby/network/MessageBuffer.py --- a/src/mem/ruby/network/MessageBuffer.py +++ b/src/mem/ruby/network/MessageBuffer.py @@ -38,6 +38,7 @@ buffer_size = Param.Unsigned(0, "Maximum number of entries to buffer \ (0 allows infinite entries)") randomization = Param.Bool(False, "") + ruby_system = Param.RubySystem(Parent.any, "Parent RubySystem object") master = MasterPort("Master port to MessageBuffer receiver") slave = SlavePort("Slave port from MessageBuffer sender") diff --git a/src/mem/ruby/network/Network.py b/src/mem/ruby/network/Network.py --- a/src/mem/ruby/network/Network.py +++ b/src/mem/ruby/network/Network.py @@ -28,6 +28,7 @@ # Brad Beckmann from m5.params import * +from m5.proxy import * from ClockedObject import ClockedObject from BasicLink import BasicLink @@ -47,7 +48,7 @@ block_size_bytes = Param.UInt32(64, "block size used for data messages.") control_msg_size = Param.UInt32(8, "") - ruby_system = Param.RubySystem("") + ruby_system = Param.RubySystem(Parent.any, "Parent RubySystem 
object") routers = VectorParam.BasicRouter("Network routers") netifs = VectorParam.ClockedObject("Network Interfaces") diff --git a/src/mem/ruby/profiler/Profiler.cc b/src/mem/ruby/profiler/Profiler.cc --- a/src/mem/ruby/profiler/Profiler.cc +++ b/src/mem/ruby/profiler/Profiler.cc @@ -354,7 +354,7 @@ for (uint32_t i = 0; i < MachineType_NUM; i++) { for (map::iterator it = m_ruby_system->m_abstract_controls[i].begin(); - it != m_ruby_system->m_abstract_controls[i].end(); ++it) { + it != m_ruby_system->m_abstract_controls[i].end(); ++it) { AbstractController *ctr = (*it).second; Sequencer *seq = ctr->getCPUSequencer(); @@ -371,7 +371,7 @@ for (uint32_t i = 0; i < MachineType_NUM; i++) { for (map::iterator it = m_ruby_system->m_abstract_controls[i].begin(); - it != m_ruby_system->m_abstract_controls[i].end(); ++it) { + it != m_ruby_system->m_abstract_controls[i].end(); ++it) { AbstractController *ctr = (*it).second; Sequencer *seq = ctr->getCPUSequencer(); diff --git a/src/mem/ruby/slicc_interface/AbstractController.hh b/src/mem/ruby/slicc_interface/AbstractController.hh --- a/src/mem/ruby/slicc_interface/AbstractController.hh +++ b/src/mem/ruby/slicc_interface/AbstractController.hh @@ -141,6 +141,7 @@ const NodeID m_version; MachineID m_machineID; const NodeID m_clusterID; + RubySystem *m_ruby_system; // MasterID used by some components of gem5. 
const MasterID m_masterId; diff --git a/src/mem/ruby/slicc_interface/AbstractController.cc b/src/mem/ruby/slicc_interface/AbstractController.cc --- a/src/mem/ruby/slicc_interface/AbstractController.cc +++ b/src/mem/ruby/slicc_interface/AbstractController.cc @@ -38,6 +38,7 @@ AbstractController::AbstractController(const Params *p) : MemObject(p), Consumer(this), m_version(p->version), m_clusterID(p->cluster_id), + m_ruby_system(p->ruby_system), m_masterId(p->system->getMasterId(name())), m_is_blocking(false), m_block_size_bytes(p->block_size_bytes), m_block_size_bits(floorLog2(m_block_size_bytes)), @@ -47,13 +48,12 @@ memoryPort(csprintf("%s.memory", name()), this, "") { assert(isPowerOf2(m_block_size_bytes)); - } void AbstractController::init() { - params()->ruby_system->registerAbstractController(this); + m_ruby_system->registerAbstractController(this); m_delayHistogram.init(10); uint32_t size = Network::getNumberOfVirtualNetworks(); for (uint32_t i = 0; i < size; i++) { @@ -226,7 +226,7 @@ pkt->pushSenderState(s); // Use functional rather than timing accesses during warmup - if (RubySystem::getWarmupEnabled()) { + if (m_ruby_system->getWarmupEnabled()) { memoryPort.sendFunctional(pkt); recvTimingResp(pkt); return; @@ -250,7 +250,7 @@ pkt->pushSenderState(s); // Use functional rather than timing accesses during warmup - if (RubySystem::getWarmupEnabled()) { + if (m_ruby_system->getWarmupEnabled()) { memoryPort.sendFunctional(pkt); recvTimingResp(pkt); return; diff --git a/src/mem/ruby/slicc_interface/Controller.py b/src/mem/ruby/slicc_interface/Controller.py --- a/src/mem/ruby/slicc_interface/Controller.py +++ b/src/mem/ruby/slicc_interface/Controller.py @@ -47,7 +47,7 @@ recycle_latency = Param.Cycles(10, "") number_of_TBEs = Param.Int(256, "") - ruby_system = Param.RubySystem("") + ruby_system = Param.RubySystem(Parent.any, "Parent RubySystem object") memory = MasterPort("Port for attaching a memory controller") system = Param.System(Parent.any, "system object 
parameter") diff --git a/src/mem/ruby/structures/TimerTable.cc b/src/mem/ruby/structures/TimerTable.cc --- a/src/mem/ruby/structures/TimerTable.cc +++ b/src/mem/ruby/structures/TimerTable.cc @@ -33,7 +33,7 @@ { m_consumer_ptr = NULL; m_next_valid = false; - m_next_address = 0; + m_next_address = Addr(0); } bool diff --git a/src/mem/ruby/system/RubySystem.hh b/src/mem/ruby/system/RubySystem.hh --- a/src/mem/ruby/system/RubySystem.hh +++ b/src/mem/ruby/system/RubySystem.hh @@ -71,10 +71,10 @@ ~RubySystem(); // config accessors - static int getRandomization() { return m_randomization; } - static uint32_t getMemorySizeBits() { return m_memory_size_bits; } - static bool getWarmupEnabled() { return m_warmup_enabled; } - static bool getCooldownEnabled() { return m_cooldown_enabled; } + int getRandomization() { return m_randomization; } + uint32_t getMemorySizeBits() { return m_memory_size_bits; } + bool getWarmupEnabled() { return m_warmup_enabled; } + bool getCooldownEnabled() { return m_cooldown_enabled; } SimpleMemory *getPhysMem() { return m_phys_mem; } Cycles getStartCycle() { return m_start_cycle; } @@ -128,14 +128,14 @@ private: // configuration parameters - static bool m_randomization; - const uint32_t m_block_size_bytes; - const uint32_t m_block_size_bits; - static uint32_t m_memory_size_bits; + bool m_randomization; + uint32_t m_block_size_bytes; + uint32_t m_block_size_bits; + uint32_t m_memory_size_bits; - static bool m_warmup_enabled; + bool m_warmup_enabled; + bool m_cooldown_enabled; static unsigned m_systems_to_warmup; - static bool m_cooldown_enabled; SimpleMemory *m_phys_mem; const bool m_access_backing_store; diff --git a/src/mem/ruby/system/RubySystem.cc b/src/mem/ruby/system/RubySystem.cc --- a/src/mem/ruby/system/RubySystem.cc +++ b/src/mem/ruby/system/RubySystem.cc @@ -46,13 +46,7 @@ using namespace std; -bool RubySystem::m_randomization; -uint32_t RubySystem::m_memory_size_bits; -bool RubySystem::m_warmup_enabled = false; -// To look forward to 
allowing multiple RubySystem instances, track the number -// of RubySystems that need to be warmed up on checkpoint restore. unsigned RubySystem::m_systems_to_warmup = 0; -bool RubySystem::m_cooldown_enabled = false; RubySystem::RubySystem(const Params *p) : ClockedObject(p), m_block_size_bytes(p->block_size_bytes), @@ -62,6 +56,9 @@ { m_randomization = p->randomization; + m_warmup_enabled = false; + m_cooldown_enabled = false; + assert(isPowerOf2(m_block_size_bytes)); m_memory_size_bits = p->memory_size_bits; @@ -249,7 +246,7 @@ // Aggregate the trace entries together into a single array uint8_t *raw_data = new uint8_t[4096]; uint64_t cache_trace_size = m_cache_recorder->aggregateRecords(&raw_data, - 4096); + 4096); string cache_trace_file = name() + ".cache.gz"; writeCompressedTrace(raw_data, cache_trace_file, cache_trace_size); @@ -388,10 +385,10 @@ void RubySystem::RubyEvent::process() { - if (RubySystem::getWarmupEnabled()) { + if (m_ruby_system->getWarmupEnabled()) { m_ruby_system->m_cache_recorder->enqueueNextFetchRequest( m_ruby_system->m_block_size_bytes); - } else if (RubySystem::getCooldownEnabled()) { + } else if (m_ruby_system->getCooldownEnabled()) { m_ruby_system->m_cache_recorder->enqueueNextFlushRequest(); } } diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh --- a/src/mem/ruby/system/Sequencer.hh +++ b/src/mem/ruby/system/Sequencer.hh @@ -148,6 +148,9 @@ Stats::Counter getIncompleteTimes(const MachineType t) const { return m_IncompleteTimes[t]; } + protected: + RubySystem *m_ruby_system; + private: void issueRequest(PacketPtr pkt, RubyRequestType type); diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc --- a/src/mem/ruby/system/Sequencer.cc +++ b/src/mem/ruby/system/Sequencer.cc @@ -55,6 +55,7 @@ : RubyPort(p), m_IncompleteTimes(MachineType_NUM), deadlockCheckEvent(this) { + m_ruby_system = p->ruby_system; m_outstanding_count = 0; m_instCache_ptr = p->icache; @@ -452,7 +453,7 @@ 
total_latency); // update the data unless it is a non-data-carrying flush - if (RubySystem::getWarmupEnabled()) { + if (m_ruby_system->getWarmupEnabled()) { data.setData(pkt->getConstPtr<uint8_t>(), offset_address, pkt->getSize()); } else if (!pkt->isFlush()) { @@ -487,15 +488,15 @@ delete srequest; - RubySystem *rs = m_ruby_system; - if (RubySystem::getWarmupEnabled()) { + if (m_ruby_system->getWarmupEnabled()) { assert(pkt->req); delete pkt->req; delete pkt; - rs->m_cache_recorder->enqueueNextFetchRequest(m_block_size_bytes); - } else if (RubySystem::getCooldownEnabled()) { + m_ruby_system->m_cache_recorder->\ + enqueueNextFetchRequest(m_block_size_bytes); + } else if (m_ruby_system->getCooldownEnabled()) { delete pkt; - rs->m_cache_recorder->enqueueNextFlushRequest(); + m_ruby_system->m_cache_recorder->enqueueNextFlushRequest(); } else { ruby_hit_callback(pkt); testDrainComplete(); diff --git a/src/mem/ruby/system/Sequencer.py b/src/mem/ruby/system/Sequencer.py --- a/src/mem/ruby/system/Sequencer.py +++ b/src/mem/ruby/system/Sequencer.py @@ -46,7 +46,7 @@ using_ruby_tester = Param.Bool(False, "") no_retry_on_stall = Param.Bool(False, "") - ruby_system = Param.RubySystem(Parent.any, "") + ruby_system = Param.RubySystem(Parent.any, "Parent RubySystem object") system = Param.System(Parent.any, "system object") block_size_bytes = Param.UInt32(64, "cache block size") support_data_reqs = Param.Bool(True, "data cache requests supported")