diff -r 97afdc397b89 -r 40fa2d3fc0c8 configs/common/CacheConfig.py --- a/configs/common/CacheConfig.py Thu Apr 21 20:01:28 2011 -0700 +++ b/configs/common/CacheConfig.py Fri Apr 22 17:11:35 2011 -0700 @@ -40,7 +40,19 @@ system.tol2bus = Bus() system.l2.cpu_side = system.tol2bus.port system.l2.mem_side = system.membus.port - system.l2.num_cpus = options.num_cpus + + ## This assumes the L2 is shared amongst all cpus (and their threads + ## in the system. If you want per HW thread miss stats, you need + ## to set the cache param num_sharing_contexts to the appropriate value. + ## FYI: on how stats would be calculated: + ## C0T0 = context 0, C0T1 = context 1, C1T0 = context 2, C1T1 =context3 + ## Stats would then be divvied through % of number of contexts + ## If this value is not set for a shared cache, all stats will just be + ## aggregated together. + num_contexts = options.num_cpus * system.cpu[0].numThreads # if SMT + if buildEnv['FULL_SYSTEM']: + num_contexts = num_contexts + 1 ## for device accesses + system.l2.num_sharing_contexts = num_contexts for i in xrange(options.num_cpus): if options.caches: diff -r 97afdc397b89 -r 40fa2d3fc0c8 src/cpu/base.cc --- a/src/cpu/base.cc Thu Apr 21 20:01:28 2011 -0700 +++ b/src/cpu/base.cc Fri Apr 22 17:11:35 2011 -0700 @@ -311,14 +311,15 @@ /** This is so that contextId and cpuId match where there is a * 1cpu:1context relationship. Otherwise, the order of registration * could affect the assignment and cpu 1 could have context id 3, for - * example. We may even want to do something like this for SMT so that - * cpu 0 has the lowest thread contexts and cpu N has the highest, but - * I'll just do this for now - */ + * example. Trying to do the same thing for SMT such that cpu2 context1 + * has contextId = 5, assuming 2-way SMT and that all cores are + * homogeneous in thread width. 
+ **/ if (numThreads == 1) tc->setContextId(system->registerThreadContext(tc, _cpuId)); else - tc->setContextId(system->registerThreadContext(tc)); + tc->setContextId(system->registerThreadContext(tc, + _cpuId*size + tid)); #if !FULL_SYSTEM tc->getProcessPtr()->assignThreadContext(tc->contextId()); #endif diff -r 97afdc397b89 -r 40fa2d3fc0c8 src/mem/cache/BaseCache.py --- a/src/mem/cache/BaseCache.py Thu Apr 21 20:01:28 2011 -0700 +++ b/src/mem/cache/BaseCache.py Fri Apr 22 17:11:35 2011 -0700 @@ -44,7 +44,7 @@ prioritizeRequests = Param.Bool(False, "always service demand misses first") repl = Param.Repl(NULL, "replacement policy") - num_cpus = Param.Int(1, "number of cpus sharing this cache") + num_sharing_contexts = Param.Int(1, "number of contexts sharing this cache") size = Param.MemorySize("capacity in bytes") forward_snoops = Param.Bool(True, "forward snoops from mem side to cpu side") diff -r 97afdc397b89 -r 40fa2d3fc0c8 src/mem/cache/base.hh --- a/src/mem/cache/base.hh Thu Apr 21 20:01:28 2011 -0700 +++ b/src/mem/cache/base.hh Fri Apr 22 17:11:35 2011 -0700 @@ -228,10 +228,10 @@ Range addrRange; /** number of cpus sharing this cache - from config file */ - int _numCpus; + int _numSharingContexts; public: - int numCpus() { return _numCpus; } + int numSharers() { return _numSharingContexts; } // Statistics /** * @addtogroup CacheStatistics @@ -495,25 +495,32 @@ virtual bool inMissQueue(Addr addr) = 0; + // generic macro for incrementing thread-based vector stats + // if id is -1, then it's from a device and put it in a special + // bin for devices, which is only instantiated in FS sims + // otherwise, mod it to the right bin (if there are switch_cpus in the + // system, then you need the mod to get the right bin. 
+#define incThreadVectorStat(pkt, stat) \ + if (pkt->req->hasContextId()) \ + stat[pkt->cmdToIndex()][pkt->req->contextId() % _numSharingContexts]++; \ + else { \ + assert(pkt->req->contextId() == -1); \ + assert(FULL_SYSTEM); \ + stat[pkt->cmdToIndex()][_numSharingContexts]++; \ + } + +#define addThreadVectorStat(pkt, stat, amt) \ + if (pkt->req->hasContextId()) \ + stat[pkt->cmdToIndex()][pkt->req->contextId() % _numSharingContexts] += amt; \ + else { \ + assert(pkt->req->contextId() == -1); \ + assert(FULL_SYSTEM); \ + stat[pkt->cmdToIndex()][_numSharingContexts] += amt; \ + } + void incMissCount(PacketPtr pkt, int id) { - - if (pkt->cmd == MemCmd::Writeback) { - assert(id == -1); - misses[pkt->cmdToIndex()][0]++; - /* same thing for writeback hits as misses - no context id - * available, meanwhile writeback hit/miss stats are not used - * in any aggregate hit/miss calculations, so just lump them all - * in bucket 0 */ -#if FULL_SYSTEM - } else if (id == -1) { - // Device accesses have id -1 - // lump device accesses into their own bucket - misses[pkt->cmdToIndex()][_numCpus]++; -#endif - } else { - misses[pkt->cmdToIndex()][id % _numCpus]++; - } + incThreadVectorStat(pkt, misses); if (missCount) { --missCount; @@ -521,29 +528,6 @@ exitSimLoop("A cache reached the maximum miss count"); } } - void incHitCount(PacketPtr pkt, int id) - { - - /* Writeback requests don't have a context id associated with - * them, so attributing a hit to a -1 context id is obviously a - * problem. 
I've noticed in the stats that hits are split into - * demand and non-demand hits - neither of which include writeback - * hits, so here, I'll just put the writeback hits into bucket 0 - * since it won't mess with any other stats -hsul */ - if (pkt->cmd == MemCmd::Writeback) { - assert(id == -1); - hits[pkt->cmdToIndex()][0]++; -#if FULL_SYSTEM - } else if (id == -1) { - // Device accesses have id -1 - // lump device accesses into their own bucket - hits[pkt->cmdToIndex()][_numCpus]++; -#endif - } else { - /* the % is necessary in case there are switch cpus */ - hits[pkt->cmdToIndex()][id % _numCpus]++; - } - } }; diff -r 97afdc397b89 -r 40fa2d3fc0c8 src/mem/cache/base.cc --- a/src/mem/cache/base.cc Thu Apr 21 20:01:28 2011 -0700 +++ b/src/mem/cache/base.cc Fri Apr 22 17:11:35 2011 -0700 @@ -65,7 +65,7 @@ missCount(p->max_miss_count), drainEvent(NULL), addrRange(p->addr_range), - _numCpus(p->num_cpus) + _numSharingContexts(p->num_sharing_contexts) { } @@ -151,11 +151,7 @@ const string &cstr = cmd.toString(); hits[access_idx] -#if FULL_SYSTEM - .init(_numCpus + 1) -#else - .init(_numCpus) -#endif + .init(_numSharingContexts) .name(name() + "." + cstr + "_hits") .desc("number of " + cstr + " hits") .flags(total | nozero | nonan) @@ -192,11 +188,7 @@ const string &cstr = cmd.toString(); misses[access_idx] -#if FULL_SYSTEM - .init(_numCpus + 1) -#else - .init(_numCpus) -#endif + .init(_numSharingContexts) .name(name() + "." + cstr + "_misses") .desc("number of " + cstr + " misses") .flags(total | nozero | nonan) @@ -223,7 +215,7 @@ const string &cstr = cmd.toString(); missLatency[access_idx] - .init(maxThreadsPerCPU) + .init(_numSharingContexts) .name(name() + "." 
+ cstr + "_miss_latency") .desc("number of " + cstr + " miss cycles") .flags(total | nozero | nonan) @@ -366,7 +358,7 @@ ; writebacks - .init(maxThreadsPerCPU) + .init(_numSharingContexts) .name(name() + ".writebacks") .desc("number of writebacks") .flags(total) @@ -379,7 +371,7 @@ const string &cstr = cmd.toString(); mshr_hits[access_idx] - .init(maxThreadsPerCPU) + .init(_numSharingContexts) .name(name() + "." + cstr + "_mshr_hits") .desc("number of " + cstr + " MSHR hits") .flags(total | nozero | nonan) @@ -406,7 +398,7 @@ const string &cstr = cmd.toString(); mshr_misses[access_idx] - .init(maxThreadsPerCPU) + .init(_numSharingContexts) .name(name() + "." + cstr + "_mshr_misses") .desc("number of " + cstr + " MSHR misses") .flags(total | nozero | nonan) @@ -433,7 +425,7 @@ const string &cstr = cmd.toString(); mshr_miss_latency[access_idx] - .init(maxThreadsPerCPU) + .init(_numSharingContexts) .name(name() + "." + cstr + "_mshr_miss_latency") .desc("number of " + cstr + " MSHR miss cycles") .flags(total | nozero | nonan) @@ -461,7 +453,7 @@ const string &cstr = cmd.toString(); mshr_uncacheable[access_idx] - .init(maxThreadsPerCPU) + .init(_numSharingContexts) .name(name() + "." + cstr + "_mshr_uncacheable") .desc("number of " + cstr + " MSHR uncacheable") .flags(total | nozero | nonan) @@ -482,7 +474,7 @@ const string &cstr = cmd.toString(); mshr_uncacheable_lat[access_idx] - .init(maxThreadsPerCPU) + .init(_numSharingContexts) .name(name() + "." 
+ cstr + "_mshr_uncacheable_latency") .desc("number of " + cstr + " MSHR uncacheable cycles") .flags(total | nozero | nonan) @@ -611,7 +603,7 @@ overallAvgMshrUncacheableLatency = overallMshrUncacheableLatency / overallMshrUncacheable; mshr_cap_events - .init(maxThreadsPerCPU) + .init(_numSharingContexts) .name(name() + ".mshr_cap_events") .desc("number of times MSHR cap was activated") .flags(total) @@ -619,7 +611,7 @@ //software prefetching stats soft_prefetch_mshr_full - .init(maxThreadsPerCPU) + .init(_numSharingContexts) .name(name() + ".soft_prefetch_mshr_full") .desc("number of mshr full events for SW prefetching instrutions") .flags(total) diff -r 97afdc397b89 -r 40fa2d3fc0c8 src/mem/cache/blk.hh --- a/src/mem/cache/blk.hh Thu Apr 21 20:01:28 2011 -0700 +++ b/src/mem/cache/blk.hh Fri Apr 22 17:11:35 2011 -0700 @@ -153,6 +153,7 @@ whenReady = rhs.whenReady; set = rhs.set; refCount = rhs.refCount; + contextSrc = rhs.contextSrc; return *this; } diff -r 97afdc397b89 -r 40fa2d3fc0c8 src/mem/cache/cache_impl.hh --- a/src/mem/cache/cache_impl.hh Thu Apr 21 20:01:28 2011 -0700 +++ b/src/mem/cache/cache_impl.hh Fri Apr 22 17:11:35 2011 -0700 @@ -333,7 +333,7 @@ if (pkt->needsExclusive() ? 
blk->isWritable() : blk->isReadable()) { // OK to satisfy access - incHitCount(pkt, id); + incThreadVectorStat(pkt, hits); satisfyCpuSideRequest(pkt, blk); return true; } @@ -367,7 +367,7 @@ } // nothing else to do; writeback doesn't expect response assert(!pkt->needsResponse()); - incHitCount(pkt, id); + incThreadVectorStat(pkt, hits); return true; } @@ -522,10 +522,11 @@ Addr blk_addr = blockAlign(pkt->getAddr()); MSHR *mshr = mshrQueue.findMatch(blk_addr); + if (mshr) { // MSHR hit //@todo remove hw_pf here - mshr_hits[pkt->cmdToIndex()][0/*pkt->req->threadId()*/]++; + incThreadVectorStat(pkt, mshr_hits); if (mshr->threadNum != 0/*pkt->req->threadId()*/) { mshr->threadNum = -1; } @@ -540,7 +541,7 @@ } } else { // no MSHR - mshr_misses[pkt->cmdToIndex()][0/*pkt->req->threadId()*/]++; + incThreadVectorStat(pkt, mshr_misses); // always mark as cache fill for now... if we implement // no-write-allocate or bypass accesses this will have to // be changed. @@ -849,16 +850,13 @@ // Initial target is used just for stats MSHR::Target *initial_tgt = mshr->getTarget(); BlkType *blk = tags->findBlock(pkt->getAddr()); - int stats_cmd_idx = initial_tgt->pkt->cmdToIndex(); Tick miss_latency = curTick() - initial_tgt->recvTime; PacketList writebacks; if (pkt->req->isUncacheable()) { - mshr_uncacheable_lat[stats_cmd_idx][0/*pkt->req->threadId()*/] += - miss_latency; + addThreadVectorStat(initial_tgt->pkt, mshr_uncacheable_lat, miss_latency); } else { - mshr_miss_latency[stats_cmd_idx][0/*pkt->req->threadId()*/] += - miss_latency; + addThreadVectorStat(initial_tgt->pkt, mshr_miss_latency, miss_latency); } bool is_fill = !mshr->isForward && @@ -903,8 +901,8 @@ (transfer_offset ? 
pkt->finishTime : pkt->firstWordTime); assert(!target->pkt->req->isUncacheable()); - missLatency[target->pkt->cmdToIndex()][0/*pkt->req->threadId()*/] += - completion_time - target->recvTime; + addThreadVectorStat(target->pkt, missLatency, (completion_time - + target->recvTime)); } else if (pkt->cmd == MemCmd::UpgradeFailResp) { // failed StoreCond upgrade assert(target->pkt->cmd == MemCmd::StoreCondReq || @@ -1008,10 +1006,16 @@ { assert(blk && blk->isValid() && blk->isDirty()); - writebacks[0/*pkt->req->threadId()*/]++; + int stat_index; + if (blk->contextSrc == -1) + stat_index = _numSharingContexts; + else + stat_index = blk->contextSrc % _numSharingContexts; + writebacks[stat_index]++; Request *writebackReq = new Request(tags->regenerateBlkAddr(blk->tag, blk->set), blkSize, 0); + writebackReq->setThreadContext(blk->contextSrc, 0 /*don't know tid*/); PacketPtr writeback = new Packet(writebackReq, MemCmd::Writeback, -1); if (blk->isWritable()) { writeback->setSupplyExclusive(); @@ -1077,6 +1081,7 @@ assert(pkt->hasData()); // need to do a replacement blk = allocateBlock(addr, writebacks); + int id = pkt->req->hasContextId() ? pkt->req->contextId() : -1; if (blk == NULL) { // No replaceable block... just use temporary storage to // complete the current request and then get rid of it @@ -1084,9 +1089,9 @@ blk = tempBlock; tempBlock->set = tags->extractSet(addr); tempBlock->tag = tags->extractTag(addr); + tempBlock->contextSrc = id; DPRINTF(Cache, "using temp block for %x\n", addr); } else { - int id = pkt->req->hasContextId() ?
pkt->req->contextId() : -1; tags->insertBlock(pkt->getAddr(), blk, id); } @@ -1430,7 +1435,13 @@ if (!tags->findBlock(pf_addr) && !mshrQueue.findMatch(pf_addr)) { // Update statistic on number of prefetches issued // (hwpf_mshr_misses) - mshr_misses[pkt->cmdToIndex()][0/*pkt->req->threadId()*/]++; + int stat_index; + if (pkt->req->hasContextId()) { + stat_index = pkt->req->contextId() % _numSharingContexts; + } else { + stat_index = _numSharingContexts; + } + mshr_misses[pkt->cmdToIndex()][stat_index]++; // Don't request bus, since we already have it return allocateMissBuffer(pkt, curTick(), false); } diff -r 97afdc397b89 -r 40fa2d3fc0c8 src/mem/cache/tags/base.cc --- a/src/mem/cache/tags/base.cc Thu Apr 21 20:01:28 2011 -0700 +++ b/src/mem/cache/tags/base.cc Fri Apr 22 17:11:35 2011 -0700 @@ -87,7 +87,7 @@ ; occupancies - .init(cache->numCpus() + 1) + .init(cache->numSharers() + 1) // +1 is for l1 writebacks .name(name + ".occ_blocks") .desc("Average occupied blocks per context") .flags(nozero | nonan) diff -r 97afdc397b89 -r 40fa2d3fc0c8 src/mem/cache/tags/lru.cc --- a/src/mem/cache/tags/lru.cc Thu Apr 21 20:01:28 2011 -0700 +++ b/src/mem/cache/tags/lru.cc Fri Apr 22 17:11:35 2011 -0700 @@ -161,10 +161,10 @@ // deal with evicted block if (blk->contextSrc != -1) { - occupancies[blk->contextSrc % cache->numCpus()]--; - blk->contextSrc = -1; + occupancies[blk->contextSrc % cache->numSharers()]--; +// blk->contextSrc = -1; } else { - occupancies[cache->numCpus()]--; + occupancies[cache->numSharers()]--; } DPRINTF(CacheRepl, "set %x: selecting blk %x for replacement\n", @@ -190,9 +190,9 @@ // deal with what we are bringing in if (context_src != -1) { - occupancies[context_src % cache->numCpus()]++; + occupancies[context_src % cache->numSharers()]++; } else { - occupancies[cache->numCpus()]++; + occupancies[cache->numSharers()]++; } blk->contextSrc = context_src; @@ -209,10 +209,10 @@ blk->clearLoadLocks(); tagsInUse--; if (blk->contextSrc != -1) { - 
occupancies[blk->contextSrc % cache->numCpus()]--; + occupancies[blk->contextSrc % cache->numSharers()]--; blk->contextSrc = -1; } else { - occupancies[cache->numCpus()]--; + occupancies[cache->numSharers()]--; } } }