| ~ | | Purely to demonstrate this bug, I've added the following code, using the
|
| ~ | | inorder cpu. |
| | ~ | Previously, ClockedObjects would incorrectly update their internal tick after a
|
| | ~ | SrcClockDomain changed the clock period. This would lead to incorrect latency
|
| | + | calculations throughout the system, slowing down simulated runtime by 5-20%. |
| | + |
|
| | + | Purely to investigate this slowdown, I've added the following code, using the
|
| | + | inorder cpu to drive the test. |
| | |
|
| | | In src/sim/stat_control.cc (changed precision from 6 to 12 for more detail):: |
| | |
|
| ~ | | |
| ~ | | .name("sim_seconds") |
| ~ | | .desc("Number of seconds simulated") |
| ~ | | .precision(12) |
| ~ | | ; |
| | ~ | simSeconds
|
| | ~ | .name("sim_seconds")
|
| | ~ | .desc("Number of seconds simulated")
|
| | ~ | .precision(12)
|
| | ~ | ; |
| - | | |
| | |
|
| | | In src/sim/clocked_object.hh (added function for a clocked object to get its clock domain's pointer):: |
| | |
|
| ~ | | inline ClockDomain* getClockDomain() const
|
| ~ | | { |
| ~ | | ClockDomain *c = &clockDomain; |
| ~ | | return c; |
| ~ | | } |
| | ~ | inline ClockDomain getClockDomain() const
|
| | ~ | {
|
| | ~ | ClockDomain c = &clockDomain;
|
| | ~ | return c;
|
| | ~ | } |
| - | | |
| | |
|
| | | In src/cpu/inorder/InOrderCPU.py (change stageWidth to 1 to make dumps easier to read):: |
| | |
|
| ~ | | stageWidth = Param.Unsigned(1, "Stage width")
|
| | ~ | stageWidth = Param.Unsigned(1, "Stage width") |
| - | | |
| | |
|
| | | In src/cpu/inorder/cpu.hh (create a pointer to a SrcClockDomain to control the CPU's clock period from within the CPU):: |
| | |
|
| ~ | | SrcClockDomain *clockDomain;
|
| | ~ | SrcClockDomain *clockDomain; |
| - | | |
| | |
|
| | | In src/cpu/inorder/cpu.cc (let CPU constructor set the clock domain pointer):: |
| | |
|
| ~ | | InOrderCPU::InOrderCPU(Params *params)
|
| ~ | | : (...) |
| ~ | | { |
| ~ | | (...) |
| ~ | | clockDomain = (SrcClockDomain*) getClockDomain(); |
| ~ | | } |
| | ~ | InOrderCPU::InOrderCPU(Params params)
|
| | ~ | : (...)
|
| | ~ | {
|
| | ~ | (...)
|
| | ~ | clockDomain = (SrcClockDomain) getClockDomain();
|
| | ~ | } |
| - | | |
| | |
|
| | | In src/cpu/inorder/cpu.cc (after 5000 cycles, change the frequency to 1999ps):: |
| | |
|
| | | void
|
| ~ | | InOrderCPU::tick()
|
| ~ | | { |
| | ~ | InOrderCPU::tick()
|
| | ~ | { |
| | |
|
| ~ | | |
| | ~ | (...) |
| - | | |
| - | | if (numCycles.value() > 5000) |
| - | | { |
| - | | clockDomain->clockPeriod(1999); |
| - | | } |
| - | | |
| | |
|
| ~ | | } |
| | ~ | if (numCycles.value() > 5000)
|
| | + | {
|
| | + | clockDomain->clockPeriod(1999);
|
| | + | }
|
| | + | } |
| | |
|
| | | After compiling these testing changes, I run MIPS hello world at 500 MHz
|
| ~ | | (2000 ps):: |
| | ~ | (2000 ps):: |
| | |
|
| ~ | | $ ./build/MIPS/gem5.opt configs/example/se.py -c tests/test-progs/hello/bin/mips/linux/hello --cpu-type=inorder --caches --cpu-clock=500MHz --l1d_size=16kB --l1i_size=16kB --l1d_assoc=1 --l1i_assoc=1 --cacheline_size=32 --num-cpus=1
|
| | ~ | $ ./build/MIPS/gem5.opt configs/example/se.py -c tests/test-progs/hello/bin/mips/linux/hello --cpu-type=inorder --caches --cpu-clock=500MHz --l1d_size=16kB --l1i_size=16kB --l1d_assoc=1 --l1i_assoc=1 --cacheline_size=32 --num-cpus=1 |
| - | | |
| | |
|
| | | Results: |
| | |
|
| | | - Without patch, simSeconds is 0.000040804594.
|
| | | - With patch, simSeconds is 0.000038657668.
|
| | |
|
| | | The patch speeds up simulated runtime by 5.6% here. You can check the Exec
|
| ~ | | debug-flag dump to see that memory instructions complete with different
|
| ~ | | latencies with and without the patch. |
| | ~ | debug-flag dump to see that memory instructions complete with different
|
| | ~ | latencies with and without the patch. |
| | |
|
| | | Instructions in middle of simulation, without patch, the time difference is 5997 ps (3
|
| ~ | | cycles):: |
| | ~ | cycles):: |
| | |
|
| ~ | | 10073967: system.cpu T0 : @_int_malloc+992 : srl r9, r21 : IntAlu : D=0x0000000000000000
|
| ~ | | 10079964: system.cpu T0 : @_int_malloc+996 : lw r25, -32740(r28) : MemRead : D=0x0000000010000000 A=0x10004fac |
| | ~ | 10073967: system.cpu T0 : @_int_malloc+992 : srl r9, r21 : IntAlu : D=0x0000000000000000
|
| | ~ | 10079964: system.cpu T0 : @_int_malloc+996 : lw r25, -32740(r28) : MemRead : D=0x0000000010000000 A=0x10004fac |
| - | | |
| | |
|
| | | Same instructions with patch, the time difference is 3998 ps (2 cycles):: |
| | |
|
| ~ | | 10071968: system.cpu T0 : @_int_malloc+992 : srl r9, r21 : IntAlu : D=0x0000000000000000
|
| ~ | | 10075966: system.cpu T0 : @_int_malloc+996 : lw r25, -32740(r28) : MemRead : D=0x0000000010000000 A=0x10004fac |
| | ~ | 10071968: system.cpu T0 : @_int_malloc+992 : srl r9, r21 : IntAlu : D=0x0000000000000000
|
| | ~ | 10075966: system.cpu T0 : @_int_malloc+996 : lw r25, -32740(r28) : MemRead : D=0x0000000010000000 A=0x10004fac |
| - | | |
| | |
|
| | | The correct L1 hit latency is 2 cycles. |
| | |
|
| | | For more information, you can look at src/sim/clocked_object.hh update() to
|
| ~ | | see how the tick is incorrectly updated after a clocked object wakes up. |
| | ~ | see how the tick is incorrectly updated after a clocked object wakes up. |
| | |
|
| | | Here's the relevant section of code in src/sim/clocked_object.hh update():: |
| | |
|
| ~ | | |
| ~ | | { |
| ~ | | (...) |
| ~ | | |
| ~ | | // if not, we have to recalculate the cycle and tick, we |
| ~ | | // perform the calculations in terms of relative cycles to |
| ~ | | // allow changes to the clock period in the future |
| ~ | | Cycles elapsedCycles(divCeil(curTick() - tick, clockPeriod())); |
| ~ | | cycle += elapsedCycles; |
| ~ | | tick += elapsedCycles * clockPeriod(); |
| | ~ | void update() const
|
| | ~ | {
|
| | ~ | (...) |
| | ~ |
|
| | ~ | // if not, we have to recalculate the cycle and tick, we
|
| | ~ | // perform the calculations in terms of relative cycles to |
| | ~ | // allow changes to the clock period in the future |
| | ~ | Cycles elapsedCycles(divCeil(curTick() - tick, clockPeriod())); |
| | ~ | cycle += elapsedCycles; |
| | ~ | tick += elapsedCycles * clockPeriod(); |
| - | | } |
| | | |
| | |
|
| | + | } |
| | + |
|
| | | This part of update() is only executed when a clocked object has been
|
| ~ | | inactive and has not updated its tick for a while. The code updates the tick
|
| ~ | | using clockPeriod() -- basically it's assuming that the clock period has not
|
| ~ | | changed since it went to sleep. |
| | ~ | inactive and has not updated its tick for a while. The code updates the tick
|
| | ~ | using clockPeriod() -- basically it's assuming that the clock period has not
|
| | ~ | changed since it went to sleep. |
| | |
|
| | | In the hello world example, the dcache goes out of sync with the system. You
|
| ~ | | can see this by adding this printout. |
| | ~ | can see this by adding this printout. |
| | |
|
| | | In src/sim/clocked_object.hh:: |
| | |
|
| ~ | | inline Tick clockEdge(Cycles cycles = Cycles(0)) const
|
| ~ | | { |
| ~ | | (update ...) |
| ~ | | |
| ~ | | std::cout << name() << " tick is "; |
| ~ | | std::cout << std::dec << tick << std::endl; |
| | ~ | inline Tick clockEdge(Cycles cycles = Cycles(0)) const
|
| | ~ | {
|
| | ~ | (update ...) |
| | ~ |
|
| | ~ | std::cout << name() << " tick is ";
|
| | ~ | std::cout << std::dec << tick << std::endl; |
| | | |
| ~ | | (return ...) |
| | ~ | (return ...) |
| - | | } |
| | | |
| | |
|
| | + | } |
| | + |
|
| | | If you compile and re-simulate and go to those same two instructions we
|
| ~ | | looked at earlier... |
| | ~ | looked at earlier... |
| | |
|
| | | Without patch:: |
| | |
|
| ~ | | 10073967: system.cpu T0 : @_int_malloc+992 : srl r9, r21 : IntAlu : D=0x0000000000000000
|
| ~ | | system.cpu.dcache tick is 10075945 |
| ~ | | system.cpu.dcache tick is 10075945 |
| ~ | | system.cpu.dcache tick is 10075945 |
| ~ | | system.cpu tick is 10073967 |
| ~ | | system.cpu tick is 10075966 |
| ~ | | system.cpu tick is 10077965 |
| ~ | | 10079964: system.cpu T0 : @_int_malloc+996 : lw r25, -32740(r28) : MemRead : D=0x0000000010000000 A=0x10004fac |
| | ~ | 10073967: system.cpu T0 : @_int_malloc+992 : srl r9, r21 : IntAlu : D=0x0000000000000000
|
| | ~ | system.cpu.dcache tick is 10075945
|
| | ~ | system.cpu.dcache tick is 10075945
|
| | ~ | system.cpu.dcache tick is 10075945
|
| | ~ | system.cpu tick is 10073967
|
| | ~ | system.cpu tick is 10075966
|
| | ~ | system.cpu tick is 10077965
|
| | ~ | 10079964: system.cpu T0 : @_int_malloc+996 : lw r25, -32740(r28) : MemRead : D=0x0000000010000000 A=0x10004fac |
| - | | |
| | |
|
| | | The CPU's lw graduates at tick 10079964. Meanwhile dcache has tick 10075945.
|
| ~ | | This is a difference of 4019, which is not a multiple of the 1999 ps clock
|
| ~ | | period. This mismatch causes the L1 hit latency to increase from 2 cycles to
|
| ~ | | 3 cycles. |
| | ~ | This is a difference of 4019, which is not a multiple of the 1999 ps clock
|
| | ~ | period. This mismatch causes the L1 hit latency to increase from 2 cycles to
|
| | ~ | 3 cycles. |
| | |
|
| | | With patch:: |
| | |
|
| ~ | | 10071968: system.cpu T0 : @_int_malloc+992 : srl r9, r21 : IntAlu : D=0x0000000000000000
|
| ~ | | system.cpu.dcache tick is 10071968 |
| ~ | | system.cpu.dcache tick is 10071968 |
| ~ | | system.cpu.dcache tick is 10071968 |
| ~ | | system.cputick is 10071968 |
| ~ | | system.cputick is 10073967 |
| ~ | | 10075966: system.cpu T0 : @_int_malloc+996 : lw r25, -32740(r28) : MemRead : D=0x0000000010000000 A=0x10004fac |
| | ~ | 10071968: system.cpu T0 : @_int_malloc+992 : srl r9, r21 : IntAlu : D=0x0000000000000000
|
| | ~ | system.cpu.dcache tick is 10071968
|
| | ~ | system.cpu.dcache tick is 10071968
|
| | ~ | system.cpu.dcache tick is 10071968
|
| | ~ | system.cputick is 10071968
|
| | ~ | system.cputick is 10073967
|
| | ~ | 10075966: system.cpu T0 : @_int_malloc+996 : lw r25, -32740(r28) : MemRead : D=0x0000000010000000 A=0x10004fac |
| - | | |
| | |
|
| | | The CPU's lw graduates at tick 10075966. Meanwhile dcache has tick 10071968.
|
| ~ | | This is a difference of 3998, which is exactly 2 cycles. Since the cache and
|
| ~ | | CPU are together, the L1 hit latency stays at 2 cycles even after frequency
|
| ~ | | scaling. |
| | ~ | This is a difference of 3998, which is exactly 2 cycles. Since the cache and
|
| | ~ | CPU are together, the L1 hit latency stays at 2 cycles even after frequency
|
| | ~ | scaling. |