diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh --- a/src/cpu/base_dyn_inst.hh +++ b/src/cpu/base_dyn_inst.hh @@ -150,6 +150,7 @@ IsUncacheable, ReqMade, MemOpDone, + CacheAccessDelayed, MaxFlags }; @@ -238,8 +239,9 @@ /////////////////////// TLB Miss ////////////////////// /** - * Saved memory requests (needed when the DTB address translation is - * delayed due to a hw page table walk). + * Saved memory requests (needed when (1) the DTB address translation is + * delayed due to a hw page table walk, or (2) a load cannot be executed + * due to a lack of free cache ports). */ RequestPtr savedReq; RequestPtr savedSreqLow; @@ -357,6 +359,13 @@ return (translationStarted() && !translationCompleted()); } + /** True if the instruction was delayed because of a lack of free cache + * ports while sitting in the LSQ. If this is true, execution of the + * instruction will be retried. + */ + bool cacheAccessDelayed() const { return instFlags[CacheAccessDelayed]; } + void cacheAccessDelayed(bool f) { instFlags[CacheAccessDelayed] = f; } + public: #ifdef DEBUG void dumpSNList(); diff --git a/src/cpu/o3/O3CPU.py b/src/cpu/o3/O3CPU.py --- a/src/cpu/o3/O3CPU.py +++ b/src/cpu/o3/O3CPU.py @@ -39,7 +39,9 @@ activity = Param.Unsigned(0, "Initial count") - cachePorts = Param.Unsigned(200, "Cache Ports") + cacheReadPorts = Param.Unsigned(1, "Cache dedicated read ports") + cacheSharedPorts = Param.Unsigned(0, "Cache shared read/write ports") + cacheWritePorts = Param.Unsigned(1, "Cache dedicated write ports") decodeToFetchDelay = Param.Cycles(1, "Decode to fetch delay") renameToFetchDelay = Param.Cycles(1 ,"Rename to fetch delay") diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh --- a/src/cpu/o3/iew_impl.hh +++ b/src/cpu/o3/iew_impl.hh @@ -1265,6 +1265,15 @@ continue; } + //to make sure we don't load more than the available cache read ports + if (inst->cacheAccessDelayed() && fault == NoFault) { + // Not enough cache read port now, we should wait + DPRINTF(IEW, "Execute: 
Delayed cache Access, deferring " + "load.\n"); + instQueue.deferMemAccessInst(inst); + continue; + } + if (inst->isDataPrefetch() || inst->isInstPrefetch()) { inst->fault = NoFault; } @@ -1482,6 +1491,9 @@ void DefaultIEW::tick() { + //To reset some lsq counter + ldstQueue.tick(); + wbNumInst = 0; wbCycle = 0; @@ -1507,6 +1519,9 @@ } if (exeStatus != Squashing) { + //load remaining split loads + ldstQueue.readPendingSplitLoad(); + executeInsts(); writebackInsts(); diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh --- a/src/cpu/o3/inst_queue.hh +++ b/src/cpu/o3/inst_queue.hh @@ -192,6 +192,11 @@ */ DynInstPtr getDeferredMemInstToExecute(); + /** Returns a memory instruction that was deferred due to the lack of free + * ports. + */ + DynInstPtr getDeferredMemAccessInstToExecute(); + /** * Records the instruction as the producer of a register without * adding it to the rest of the IQ. @@ -241,6 +246,11 @@ */ void deferMemInst(DynInstPtr &deferred_inst); + /** + * Defers a memory instruction when there is lack of free cache ports. + */ + void deferMemAccessInst(DynInstPtr &deferred_inst); + /** Indicates an ordering violation between a store and a load. */ void violation(DynInstPtr &store, DynInstPtr &faulting_load); @@ -307,6 +317,11 @@ */ std::list deferredMemInsts; + /** List of instructions waiting for free cache ports. Their access to + * cache was denied at least one time for lack of free cache ports. + */ + std::list deferredMemAccessInsts; + /** * Struct for comparing entries to be added to the priority queue. 
* This gives reverse ordering to the instructions in terms of diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh --- a/src/cpu/o3/inst_queue_impl.hh +++ b/src/cpu/o3/inst_queue_impl.hh @@ -410,6 +410,7 @@ nonSpecInsts.clear(); listOrder.clear(); deferredMemInsts.clear(); + deferredMemAccessInsts.clear(); } template @@ -740,6 +741,15 @@ total_deferred_mem_issued++; } + //for adding mem instructions that were denied to access cache because of lack of free ports + //Should it be done before getDeferredMemInstToExecute()? + while (total_deferred_mem_issued < totalWidth && + (deferred_mem_inst = getDeferredMemAccessInstToExecute()) != 0) { + issueToExecuteQueue->access(0)->size++; + instsToExecute.push_back(deferred_mem_inst); + total_deferred_mem_issued++; + } + // Have iterator to head of the list // While I haven't exceeded bandwidth or reached the end of the list, // Try to get a FU that can do what this op needs. @@ -873,7 +883,8 @@ // @todo If the way deferred memory instructions are handeled due to // translation changes then the deferredMemInsts condition should be removed // from the code below. 
- if (total_issued || total_deferred_mem_issued || deferredMemInsts.size()) { + if (total_issued || total_deferred_mem_issued || deferredMemInsts.size() + || deferredMemAccessInsts.size()) { cpu->activityThisCycle(); } else { DPRINTF(IQ, "Not able to schedule any instructions.\n"); @@ -1093,6 +1104,26 @@ template void +InstructionQueue::deferMemAccessInst(DynInstPtr &deferred_inst) +{ + deferredMemAccessInsts.push_back(deferred_inst); +} + +template +typename Impl::DynInstPtr +InstructionQueue::getDeferredMemAccessInstToExecute() +{ + for (ListIt it = deferredMemAccessInsts.begin(); it != deferredMemAccessInsts.end(); + ++it) { + DynInstPtr ret = *it; + deferredMemAccessInsts.erase(it); + return ret; + } + return NULL; +} + +template +void InstructionQueue::violation(DynInstPtr &store, DynInstPtr &faulting_load) { diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -133,6 +133,11 @@ void writebackStores(ThreadID tid); /** + * Attempts to read a pending split load. + */ + void readPendingSplitLoad(); + + /** * Squash instructions from a thread until the specified sequence number. */ void squash(const InstSeqNum &squashed_num, ThreadID tid) @@ -311,6 +316,47 @@ /** The IEW stage pointer. */ IEW *iewStage; + /** The number of used cache read ports in this cycle. */ + int usedReadPorts; + + /** The number of used shared read/write ports in this cycle. */ + int usedSharedPorts; + + /** The number of used write ports in this cycle. */ + int usedWritePorts; + + /** The number of dedicated cache read ports available each cycle. */ + int cacheReadPorts; + + /** The number of shared cache read/write ports available each cycle. */ + int cacheSharedPorts; + + /** The number of dedicated cache write ports available each cycle. */ + int cacheWritePorts; + + /** Whether or not there is a split load packet (second packet) that + * could not be sent because of a lack of cache read request ports. 
*/ + bool hasPendingSplitPkt; + + /** Whether or not there is a free cache read port. This checks both + * dedicated read ports and shared read/write ports, if any. */ + bool freeCacheReadPorts(); + + /** Increment the number of used read ports, taking a dedicated read + * port first and a shared port otherwise. */ + void incCacheReadPorts(); + + /** Whether or not there is a free cache write port. This checks both + * dedicated write ports and shared read/write ports, if any. */ + bool freeCacheWritePorts(); + + /** Increment the number of used write ports, taking a dedicated write + * port first and a shared port otherwise. */ + void incCacheWritePorts(); + + /** The thread ID of the split load that is pending free cache read request ports. */ + ThreadID pendingSplitThreadId; + protected: /** The LSQ policy for SMT mode. */ LSQPolicy lsqPolicy; diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh --- a/src/cpu/o3/lsq_impl.hh +++ b/src/cpu/o3/lsq_impl.hh @@ -56,6 +56,9 @@ template LSQ::LSQ(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params) : cpu(cpu_ptr), iewStage(iew_ptr), + cacheReadPorts(params->cacheReadPorts), + cacheSharedPorts(params->cacheSharedPorts), + cacheWritePorts(params->cacheWritePorts), LQEntries(params->LQEntries), SQEntries(params->SQEntries), numThreads(params->numThreads), @@ -114,6 +117,13 @@ maxLQEntries, maxSQEntries, tid); thread[tid].setDcachePort(&cpu_ptr->getDataPort()); } + + //Reset Counters + usedReadPorts = 0; + usedSharedPorts = 0; + usedWritePorts = 0; + hasPendingSplitPkt = false; + pendingSplitThreadId = -1; } @@ -244,6 +254,12 @@ void LSQ::tick() { + //Resetting the per-cycle port usage counters + //DPRINTF(LSQ, "LSQ tick\n"); + usedReadPorts = 0; + usedSharedPorts = 0; + usedWritePorts = 0; + list::iterator threads = activeThreads->begin(); list::iterator end = activeThreads->end(); @@ -292,6 +308,60 @@ template void +LSQ::readPendingSplitLoad() +{ + if (hasPendingSplitPkt) { + assert(pendingSplitThreadId >= 0); + thread[pendingSplitThreadId].readPendingSplitLoad(); + 
hasPendingSplitPkt = false; + pendingSplitThreadId = -1; + } +} + +template +bool +LSQ::freeCacheReadPorts() +{ + return ((usedReadPorts < cacheReadPorts) || + (usedSharedPorts < cacheSharedPorts)); +} + +template +void +LSQ::incCacheReadPorts() +{ + assert(freeCacheReadPorts()); + if (usedReadPorts < cacheReadPorts) + ++usedReadPorts; + else { //use shared ports + assert(usedSharedPorts < cacheSharedPorts); + ++usedSharedPorts; + } +} + +template +bool +LSQ::freeCacheWritePorts() +{ + return ((usedWritePorts < cacheWritePorts) || + (usedSharedPorts < cacheSharedPorts)); +} + +template +void +LSQ::incCacheWritePorts() +{ + assert(freeCacheWritePorts()); + if (usedWritePorts < cacheWritePorts) + ++usedWritePorts; + else { //use shared ports + assert(usedSharedPorts < cacheSharedPorts); + ++usedSharedPorts; + } +} + +template +void LSQ::writebackStores() { list::iterator threads = activeThreads->begin(); @@ -347,9 +417,16 @@ bool LSQ::recvTimingResp(PacketPtr pkt) { + assert(pkt->isResponse()); + + DPRINTF(LSQ, "received pkt for addr:%#x %s, isLLSC: %d, isRead: %d, " + "isWrite: %d, isReadWrite: %d\n", pkt->getAddr(), pkt->cmdString(), + pkt->isLLSC(), pkt->isRead(), pkt->isWrite(), pkt->isReadWrite()); + if (pkt->isError()) DPRINTF(LSQ, "Got error packet back for address: %#X\n", pkt->getAddr()); + thread[pkt->req->threadId()].completeDataAccess(pkt); return true; } @@ -358,8 +435,10 @@ void LSQ::recvTimingSnoopReq(PacketPtr pkt) { - DPRINTF(LSQ, "received pkt for addr:%#x %s\n", pkt->getAddr(), - pkt->cmdString()); + assert(pkt->isRequest()); + DPRINTF(LSQ, "received snoop pkt for addr:%#x %s, isLLSC: %d, isRead: %d, " + "isWrite: %d, isReadWrite: %d\n", pkt->getAddr(), pkt->cmdString(), + pkt->isLLSC(), pkt->isRead(), pkt->isWrite(), pkt->isReadWrite()); // must be a snoop if (pkt->isInvalidate()) { diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -61,6 +61,7 @@ #include "mem/packet.hh" 
#include "mem/port.hh" #include "sim/fault_fwd.hh" +#include "sim/eventq.hh" struct DerivO3CPUParams; @@ -114,7 +115,7 @@ * @todo: Move the number of used ports up to the LSQ level so it can * be shared by all LSQ units. */ - void tick() { usedPorts = 0; } + void tick() { } /** Inserts an instruction. */ void insert(DynInstPtr &inst); @@ -486,6 +487,13 @@ /** The packet that is pending free cache ports. */ PacketPtr pendingPkt; + /** The split load packet that is pending free cache read request ports. */ + PacketPtr pendingSplitPkt; + + /** The sender state of the split load packet that is pending free cache + * read request ports. */ + LSQSenderState *pendingSplitState; + /** Flag for memory model. */ bool needsTSO; @@ -523,6 +531,14 @@ /** Number of times the LSQ is blocked due to the cache. */ Stats::Scalar lsqCacheBlocked; + /** Number of times a load request in the LSQ is denied execution due to + * a lack of free cache read ports. */ + Stats::Scalar cacheReadReqPortBlocked; + + /** Number of times a store request in the LSQ is denied execution due to + * a lack of free cache write ports. */ + Stats::Scalar cacheWriteReqPortBlocked; + public: /** Executes the load at the given index. */ Fault read(Request *req, Request *sreqLow, Request *sreqHigh, @@ -560,6 +576,11 @@ /** Returns whether or not the LSQ unit is stalled. */ bool isStalled() { return stalled; } + + /** Does a read for a split load that couldn't be completed + * the previous cycle. 
*/ + void readPendingSplitLoad(); + }; template @@ -646,6 +667,9 @@ delete fst_data_pkt; delete snd_data_pkt; } + DPRINTF(LSQUnit, "Scheduling writeback event for load [sn:%lli] " + "PC %s due to MmappedIpr\n", + load_inst->seqNum, load_inst->pcState()); WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, this); cpu->schedule(wb, cpu->clockEdge(delay)); return NoFault; @@ -707,7 +731,14 @@ PacketPtr data_pkt = new Packet(req, MemCmd::ReadReq); data_pkt->dataStatic(load_inst->memData); + + DPRINTF(LSQUnit, "Scheduling writeback event for load [sn:%lli] " + "PC %s due to store to load forwarding\n", + load_inst->seqNum, load_inst->pcState()); + WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, this); + //To disable cacheAccessDelayed + load_inst->cacheAccessDelayed(false); // We'll say this has a 1 cycle load-store forwarding latency // for now. @@ -751,6 +782,8 @@ iewStage->rescheduleMemInst(load_inst); iewStage->decrWb(load_inst->seqNum); load_inst->clearIssued(); + //Disable cache access delayed + load_inst->cacheAccessDelayed(false); ++lsqRescheduledLoads; // Do not generate a writeback event as this instruction is not @@ -773,14 +806,30 @@ } // If there's no forwarding case, then go access memory + + /*Checking for free read ports before accessing memory */ + if (!lsq->freeCacheReadPorts()) { + ++cacheReadReqPortBlocked; + DPRINTF(LSQUnit, "no free cache read ports to send load requests.\n"); + //NOTE: this instr does not lookup TLB any more, because + //translationStarted and translationCompleted are already set. 
+ + load_inst->cacheAccessDelayed(true); + load_inst->savedReq = req; + load_inst->savedSreqLow = sreqLow; + load_inst->savedSreqHigh = sreqHigh; + return NoFault; + } + + load_inst->cacheAccessDelayed(false); + /*End checking */ + DPRINTF(LSQUnit, "Doing memory access for inst [sn:%lli] PC %s\n", load_inst->seqNum, load_inst->pcState()); assert(!load_inst->memData); load_inst->memData = new uint8_t[64]; - ++usedPorts; - // if we the cache is not blocked, do cache access bool completedFirst = false; if (!lsq->cacheBlocked()) { @@ -819,7 +868,13 @@ state->mainPkt = data_pkt; } + DPRINTF(LSQUnit, "sendTiming pkt for addr:%#x\n", + fst_data_pkt->getAddr()); + lsq->incCacheReadPorts(); + if (!dcachePort->sendTimingReq(fst_data_pkt)) { + DPRINTF(LSQUnit, "sendTiming pkt for addr:%#x failed!\n", + fst_data_pkt->getAddr()); // Delete state and data packet because a load retry // initiates a pipeline restart; it does not retry. delete state; @@ -847,19 +902,39 @@ // load will be squashed, so indicate this to the state object. // The first packet will return in completeDataAccess and be // handled there. - ++usedPorts; - if (!dcachePort->sendTimingReq(snd_data_pkt)) { - // The main packet will be deleted in completeDataAccess. - delete snd_data_pkt->req; - delete snd_data_pkt; + DPRINTF(LSQUnit, "sendTiming pkt for second addr (split):%#x\n", + snd_data_pkt->getAddr()); + if (lsq->freeCacheReadPorts()) { + lsq->incCacheReadPorts(); + if (!dcachePort->sendTimingReq(snd_data_pkt)) { - state->complete(); + DPRINTF(LSQUnit, "sendTiming pkt for second addr (split):%#x failed!\n", snd_data_pkt->getAddr()); + // The main packet will be deleted in completeDataAccess. 
+ delete snd_data_pkt->req; + delete snd_data_pkt; - req = NULL; - sreqHigh = NULL; + state->complete(); - lsq->setRetryTid(lsqID); + req = NULL; + sreqHigh = NULL; + + lsq->setRetryTid(lsqID); + } + } else { + //no free cache read ports + ++cacheReadReqPortBlocked; + + //We save the packet to be sent next cycle + //There should be only one pending split packet for all + //threads, since once cache is blocked for one thread, all + //other threads should not be able to do more loads. + assert(!lsq->cacheBlocked()); + assert(pendingSplitPkt == NULL); + pendingSplitPkt = snd_data_pkt; + pendingSplitState = state; + lsq->hasPendingSplitPkt = true; + lsq->pendingSplitThreadId = load_inst->threadNumber; } } } diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh --- a/src/cpu/o3/lsq_unit_impl.hh +++ b/src/cpu/o3/lsq_unit_impl.hh @@ -107,6 +107,9 @@ iewStage->decrWb(inst->seqNum); } else { if (!state->noWB) { + DPRINTF(LSQUnit, "Complete DataAccess for load: %d, store: %d, " + "PC %s, [sn:%lli]\n", inst->isLoad(), inst->isStore(), + inst->pcState(), inst->seqNum); if (!TheISA::HasUnalignedMemAcc || !state->isSplit || !state->isLoad) { writeback(inst, pkt); @@ -116,6 +119,8 @@ } if (inst->isStore()) { + DPRINTF(LSQUnit, "Complete DataAccess for store PC %s, [sn:%lli]\n", + inst->pcState(), inst->seqNum); completeStore(state->idx); } } @@ -133,7 +138,8 @@ LSQUnit::LSQUnit() : loads(0), stores(0), storesToWB(0), cacheBlockMask(0), stalled(false), isStoreBlocked(false), isLoadBlocked(false), - loadBlockedHandled(false), storeInFlight(false), hasPendingPkt(false) + loadBlockedHandled(false), storeInFlight(false), hasPendingPkt(false), + pendingSplitPkt(NULL) { } @@ -161,7 +167,6 @@ depCheckShift = params->LSQDepCheckShift; checkLoads = params->LSQCheckLoads; - cachePorts = params->cachePorts; needsTSO = params->needsTSO; resetState(); @@ -178,7 +183,10 @@ storeHead = storeWBIdx = storeTail = 0; - usedPorts = 0; + //Reset Counters + lsq->usedReadPorts = 0; + 
lsq->usedSharedPorts = 0; + lsq->usedWritePorts = 0; retryPkt = NULL; memDepViolator = NULL; @@ -246,6 +254,14 @@ lsqCacheBlocked .name(name() + ".cacheBlocked") .desc("Number of times an access to memory failed due to the cache being blocked"); + + cacheReadReqPortBlocked + .name(name() + ".cacheReadReqPortBlocked") + .desc("Number of times an access to cache failed due to the lack of free cache read ports for load requests"); + + cacheWriteReqPortBlocked + .name(name() + ".cacheWriteReqPortBlocked") + .desc("Number of times an access to cache failed due to the lack of free cache write ports for store requests"); } template @@ -492,7 +508,8 @@ */ while (load_idx != loadTail) { DynInstPtr ld_inst = loadQueue[load_idx]; - if (!ld_inst->effAddrValid() || ld_inst->uncacheable()) { + if (!ld_inst->effAddrValid() || ld_inst->uncacheable() + || ld_inst->cacheAccessDelayed()) { incrLdIdx(load_idx); continue; } @@ -576,6 +593,13 @@ load_fault == NoFault) return load_fault; + if (inst->cacheAccessDelayed() && + load_fault == NoFault) { + DPRINTF(LSQUnit, "load PC %s, [sn:%lli] was delayed due to lack of " + "free cache ports\n", inst->pcState(), inst->seqNum); + return load_fault; + } + // If the instruction faulted or predicated false, then we need to send it // along to commit without the instruction completing. if (load_fault != NoFault || inst->readPredicate() == false) { @@ -718,12 +742,45 @@ template void +LSQUnit::readPendingSplitLoad() +{ + //since this is the first packet we send this cycle, all dedicated cache + //read ports and shared read/write ports should be available. + assert(lsq->usedReadPorts == 0 && lsq->usedSharedPorts == 0); + assert(lsq->hasPendingSplitPkt); + assert(pendingSplitPkt != NULL); + lsq->incCacheReadPorts(); + + if (!dcachePort->sendTimingReq(pendingSplitPkt)) { + DPRINTF(LSQUnit, "sendTiming pkt for second addr (split):%#x failed!\n", pendingSplitPkt->getAddr()); + // The main packet will be deleted in completeDataAccess. 
+ delete pendingSplitPkt->req; + delete pendingSplitPkt; + + pendingSplitState->complete(); + + //retryId should be -1 here. + lsq->setRetryTid(lsqID); + } + + pendingSplitPkt = NULL; + pendingSplitState = NULL; +} + +template +void LSQUnit::writebackPendingStore() { + //since this is the first packet we send this cycle, all dedicated cache + //write ports should be available, but shared ports might have already + //been used by load instr. + assert(lsq->usedWritePorts == 0); + if (hasPendingPkt) { assert(pendingPkt != NULL); // If the cache is blocked, this will store the packet for retry. + DPRINTF(LSQUnit, "write back a pending store\n"); if (sendStore(pendingPkt)) { storePostSend(pendingPkt); } @@ -746,8 +803,16 @@ storeWBIdx != storeTail && storeQueue[storeWBIdx].inst && storeQueue[storeWBIdx].canWB && - ((!needsTSO) || (!storeInFlight)) && - usedPorts < cachePorts) { + ((!needsTSO) || (!storeInFlight)) + ) { + + //Check to see if there is enough port to send this instr + if (!lsq->freeCacheWritePorts()) { + DPRINTF(LSQUnit, "Unable to write back any more stores, " + "no more free cache write ports!\n"); + ++cacheWriteReqPortBlocked; + break; + } if (isStoreBlocked || lsq->cacheBlocked()) { DPRINTF(LSQUnit, "Unable to write back any more stores, cache" @@ -765,8 +830,6 @@ continue; } - ++usedPorts; - if (storeQueue[storeWBIdx].inst->isDataPrefetch()) { incrStIdx(storeWBIdx); @@ -899,14 +962,12 @@ state->pendingPacket = snd_data_pkt; } } else { - // If split, try to send the second packet too if (split) { assert(snd_data_pkt); // Ensure there are enough ports to use. - if (usedPorts < cachePorts) { - ++usedPorts; + if (lsq->freeCacheWritePorts()) { if (sendStore(snd_data_pkt)) { storePostSend(snd_data_pkt); } else { @@ -915,9 +976,11 @@ inst->seqNum); } } else { - + DPRINTF(LSQUnit, "Unable to write back split pkt, " + "no more free cache write ports!\n"); + ++cacheWriteReqPortBlocked; // Store the packet for when there's free ports. 
- assert(pendingPkt == NULL); + assert(!hasPendingPkt); pendingPkt = snd_data_pkt; hasPendingPkt = true; } @@ -929,9 +992,6 @@ } } - // Not sure this should set it to 0. - usedPorts = 0; - assert(stores >= 0 && storesToWB >= 0); } @@ -1087,6 +1147,8 @@ // Squashed instructions do not need to complete their access. if (inst->isSquashed()) { + DPRINTF(LSQUnit,"Writeback: squashed instruction PC %s, " + "[sn:%lli]\n", inst->pcState(), inst->seqNum); iewStage->decrWb(inst->seqNum); assert(!inst->isStore()); ++lsqIgnoredResponses; @@ -1094,6 +1156,8 @@ } if (!inst->isExecuted()) { + DPRINTF(LSQUnit,"Writeback: non-executed instruction PC %s, " + "[sn:%lli]\n", inst->pcState(), inst->seqNum); inst->setExecuted(); // Complete access to copy data to proper place. @@ -1165,6 +1229,9 @@ bool LSQUnit::sendStore(PacketPtr data_pkt) { + //one write port is going to be used + lsq->incCacheWritePorts(); + if (!dcachePort->sendTimingReq(data_pkt)) { // Need to handle becoming blocked on a store. isStoreBlocked = true; @@ -1188,6 +1255,9 @@ LSQSenderState *state = dynamic_cast(retryPkt->senderState); + //Write ports are used here at this tick and they are reset in + //tick(). so they might be used again in the same tick. + if (dcachePort->sendTimingReq(retryPkt)) { // Don't finish the store unless this is the last packet. if (!TheISA::HasUnalignedMemAcc || !state->pktToSend || @@ -1202,8 +1272,20 @@ // Send any outstanding packet. if (TheISA::HasUnalignedMemAcc && state->pktToSend) { assert(state->pendingPacket); - if (sendStore(state->pendingPacket)) { - storePostSend(state->pendingPacket); + //check for free ports + if (lsq->freeCacheWritePorts()) { + if (sendStore(state->pendingPacket)) { + storePostSend(state->pendingPacket); + } + } else { //No free cache write port + DPRINTF(LSQUnit, "Unable to write back any more stores " + "when recieved a retry, no more free cache write " + "ports!\n"); + ++cacheWriteReqPortBlocked; + // Store the packet for when there's free ports. 
+ assert(!hasPendingPkt); + pendingPkt = state->pendingPacket; + hasPendingPkt = true; } } } else { diff --git a/src/mem/cache/BaseCache.py b/src/mem/cache/BaseCache.py --- a/src/mem/cache/BaseCache.py +++ b/src/mem/cache/BaseCache.py @@ -70,3 +70,6 @@ mem_side = MasterPort("Port on side closer to MEM") addr_ranges = VectorParam.AddrRange([AllMemory], "The address range for the CPU-side port") system = Param.System(Parent.any, "System we belong to") + cacheReadPorts = Param.Int(-1, "# of dedicated read ports") + cacheSharedPorts = Param.Int(-1, "# of shared read/write ports") + cacheWritePorts = Param.Int(-1, "# of dedicated write ports") diff --git a/src/mem/cache/base.hh b/src/mem/cache/base.hh --- a/src/mem/cache/base.hh +++ b/src/mem/cache/base.hh @@ -171,7 +171,8 @@ protected: CacheSlavePort(const std::string &_name, BaseCache *_cache, - const std::string &_label); + const std::string &_label, int _cacheReadPorts, + int _cacheSharedPorts, int _cacheWritePorts); /** A normal packet queue used to store responses. */ SlavePacketQueue queue; diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc --- a/src/mem/cache/base.cc +++ b/src/mem/cache/base.cc @@ -57,8 +57,12 @@ BaseCache::CacheSlavePort::CacheSlavePort(const std::string &_name, BaseCache *_cache, - const std::string &_label) - : QueuedSlavePort(_name, _cache, queue), queue(*_cache, *this, _label), + const std::string &_label, + int _cacheReadPorts, + int _cacheSharedPorts, + int _cacheWritePorts) + : QueuedSlavePort(_name, _cache, queue), queue(*_cache, *this, _label, + _cacheReadPorts, _cacheSharedPorts, _cacheWritePorts), blocked(false), mustSendRetry(false), sendRetryEvent(this) { } @@ -101,7 +105,8 @@ DPRINTF(CachePort, "Cache port %s sending retry\n", name()); mustSendRetry = false; // @TODO: need to find a better time (next bus cycle?) - owner.schedule(sendRetryEvent, curTick() + 1); + //Use owner's nextCycle, better than curTick()+1. 
+ owner.schedule(sendRetryEvent, owner.nextCycle()); } } diff --git a/src/mem/cache/cache.hh b/src/mem/cache/cache.hh --- a/src/mem/cache/cache.hh +++ b/src/mem/cache/cache.hh @@ -107,7 +107,8 @@ public: CpuSidePort(const std::string &_name, Cache *_cache, - const std::string &_label); + const std::string &_label, int _cacheReadPorts, + int _cacheSharedPorts, int _cacheWritePorts); }; @@ -128,7 +129,8 @@ MemSidePacketQueue(Cache &cache, MasterPort &port, const std::string &label) : - MasterPacketQueue(cache, port, label), cache(cache) { } + MasterPacketQueue(cache, port, label), + cache(cache) { } /** * Override the normal sendDeferredPacket and do not only diff --git a/src/mem/cache/cache_impl.hh b/src/mem/cache/cache_impl.hh --- a/src/mem/cache/cache_impl.hh +++ b/src/mem/cache/cache_impl.hh @@ -73,10 +73,11 @@ tempBlock->data = new uint8_t[blkSize]; cpuSidePort = new CpuSidePort(p->name + ".cpu_side", this, - "CpuSidePort"); + "CpuSidePort", + p->cacheReadPorts, p->cacheSharedPorts, + p->cacheWritePorts); memSidePort = new MemSidePort(p->name + ".mem_side", this, "MemSidePort"); - tags->setCache(this); if (prefetcher) prefetcher->setCache(this); @@ -1711,8 +1712,11 @@ template Cache:: CpuSidePort::CpuSidePort(const std::string &_name, Cache *_cache, - const std::string &_label) - : BaseCache::CacheSlavePort(_name, _cache, _label), cache(_cache) + const std::string &_label, int _cacheReadPorts, + int _cacheSharedPorts, int _cacheWritePorts) + : BaseCache::CacheSlavePort(_name, _cache, _label, _cacheReadPorts, + _cacheSharedPorts, _cacheWritePorts), + cache(_cache) { } diff --git a/src/mem/packet_queue.hh b/src/mem/packet_queue.hh --- a/src/mem/packet_queue.hh +++ b/src/mem/packet_queue.hh @@ -247,6 +247,35 @@ SlavePort& slavePort; + private: + /** counters used for keeping the number of used ports in each cycle */ + int usedReadPorts; + int usedSharedPorts; + int usedWritePorts; + + // number of dedicated l1 cache read ports + const int cacheReadPorts; + // 
number of shared l1 cache read/write ports + const int cacheSharedPorts; + // number of dedicated l1 cache write ports + const int cacheWritePorts; + + // used only to reset port counters at the right time + Tick nextSendCycle; + + /* Whether or not there is any free cache read ports remaining + * among all dedicated and shared ports. */ + bool freeCacheReadPorts(); + /* Whether or not there is any free cache write ports remaining + * among all dedicated and shared ports. */ + bool freeCacheWritePorts(); + /* Increment the number of used read ports wrt dedicated and shared + * ports. */ + void incCacheReadPorts(); + /* Increment the number of used write ports wrt dedicated and shared + * ports. */ + void incCacheWritePorts(); + public: /** @@ -259,7 +288,9 @@ * @param _label Label to push on the label stack for print request packets */ SlavePacketQueue(EventManager& _em, SlavePort& _slavePort, - const std::string _label = "SlavePacketQueue"); + const std::string _label = "SlavePacketQueue", + int cacheReadPorts = -1, int cacheSharedPorts = -1, + int cacheWritePorts = -1); virtual ~SlavePacketQueue() { } diff --git a/src/mem/packet_queue.cc b/src/mem/packet_queue.cc --- a/src/mem/packet_queue.cc +++ b/src/mem/packet_queue.cc @@ -45,6 +45,8 @@ #include "debug/Drain.hh" #include "debug/PacketQueue.hh" #include "mem/packet_queue.hh" +#include "sim/clocked_object.hh" +#include "mem/mem_object.hh" using namespace std; @@ -228,6 +230,44 @@ } bool +SlavePacketQueue::freeCacheReadPorts() +{ + return ((usedReadPorts < cacheReadPorts) || + (usedSharedPorts < cacheSharedPorts)); +} + +bool +SlavePacketQueue::freeCacheWritePorts() +{ + return ((usedWritePorts < cacheWritePorts) || + (usedSharedPorts < cacheSharedPorts)); +} + +void +SlavePacketQueue::incCacheReadPorts() +{ + assert(freeCacheReadPorts()); + if (usedReadPorts < cacheReadPorts) + ++usedReadPorts; + else { //we should use shared ports + assert(usedSharedPorts < cacheSharedPorts); + ++usedSharedPorts; + } +} + +void 
+SlavePacketQueue::incCacheWritePorts() +{ + assert(freeCacheWritePorts()); + if (usedWritePorts < cacheWritePorts) + ++usedWritePorts; + else { //we should use shared ports + assert(usedSharedPorts < cacheSharedPorts); + ++usedSharedPorts; + } +} + +bool MasterPacketQueue::sendTiming(PacketPtr pkt, bool send_as_snoop) { // attempt to send the packet and return according to the outcome @@ -238,9 +278,18 @@ } SlavePacketQueue::SlavePacketQueue(EventManager& _em, SlavePort& _slavePort, - const std::string _label) - : PacketQueue(_em, _label), slavePort(_slavePort) + const std::string _label, + int _cacheReadPorts, + int _cacheSharedPorts, + int _cacheWritePorts) + : PacketQueue(_em, _label), slavePort(_slavePort), + cacheReadPorts(_cacheReadPorts), cacheSharedPorts(_cacheSharedPorts), + cacheWritePorts(_cacheWritePorts) { + nextSendCycle = 0; + usedReadPorts = 0; + usedSharedPorts = 0; + usedWritePorts = 0; } bool @@ -248,5 +297,52 @@ { // we should never have queued snoop requests assert(!send_as_snoop); - return slavePort.sendTimingResp(pkt); + + //if # of ports is not bounded + if (cacheReadPorts < 0) + return slavePort.sendTimingResp(pkt); + + //if # of ports is bounded + //This is stupid, but we don't have tick() function to reset counters + //we could schedule an event to do it. 
+ if (curTick() >= nextSendCycle) { + nextSendCycle = slavePort.getOwner().nextIncomingCycle(); + usedReadPorts = 0; + usedSharedPorts = 0; + usedWritePorts = 0; + } + + // attempt to send the packet and return according to the outcome + if (pkt->isRead()) { + if (freeCacheReadPorts()) { + incCacheReadPorts(); + DPRINTF(PacketQueue, "sending read resp pkt %#x\n", pkt->getAddr()); + return slavePort.sendTimingResp(pkt); + } else { //no free cache read port + //we schedule a send timing for next cycle + DPRINTF(PacketQueue, "Scheduling read resp pkt %#x for tick %lli " + "due to lack of free ports\n", pkt->getAddr(), + nextSendCycle); + schedSendTiming(pkt, nextSendCycle); + //return true which means the other side is not blocked + //so we don't expect any recvRetry and no waitingOnRetry is set + return true; + } + } + else if (pkt->isWrite()) { + if (freeCacheWritePorts()) { + incCacheWritePorts(); + DPRINTF(PacketQueue, "sending write resp pkt %#x\n", + pkt->getAddr()); + return slavePort.sendTimingResp(pkt); + } else { //no free cache write port + DPRINTF(PacketQueue, "Scheduling write resp pkt %#x for tick %lli " + "due to lack of free ports\n", pkt->getAddr(), + nextSendCycle); + schedSendTiming(pkt, nextSendCycle); + return true; + } + } else { + panic("Unknown packet type!"); + } } diff --git a/src/mem/port.hh b/src/mem/port.hh --- a/src/mem/port.hh +++ b/src/mem/port.hh @@ -114,6 +114,9 @@ /** Get the port id. */ PortID getId() const { return id; } + /** Get the port owner. 
*/ + MemObject& getOwner(); + }; /** Forward declaration */ diff --git a/src/mem/port.cc b/src/mem/port.cc --- a/src/mem/port.cc +++ b/src/mem/port.cc @@ -59,6 +59,12 @@ { } +MemObject& +Port::getOwner() +{ + return owner; +} + BaseMasterPort::BaseMasterPort(const std::string& name, MemObject* owner, PortID _id) : Port(name, *owner, _id), _baseSlavePort(NULL) diff --git a/src/sim/clocked_object.hh b/src/sim/clocked_object.hh --- a/src/sim/clocked_object.hh +++ b/src/sim/clocked_object.hh @@ -180,6 +180,18 @@ Tick nextCycle() const { return clockEdge(); } + /** + * Since nextCycle() returns the current clock edge if it is called at + * clock edge, nextIncomingCycle() returns the next incoming clock edge. + */ + Tick nextIncomingCycle() const + { + Tick clockEdgeTick = clockEdge(); + if (clockEdgeTick == curTick()) + return clockEdgeTick + clock; + return clockEdgeTick; + } + inline uint64_t frequency() const { return SimClock::Frequency / clock; } inline Tick clockPeriod() const { return clock; }