diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh --- a/src/cpu/o3/lsq_unit_impl.hh +++ b/src/cpu/o3/lsq_unit_impl.hh @@ -106,6 +106,8 @@ iewStage->decrWb(inst->seqNum); } else { if (!state->noWB) { + DPRINTF(LSQUnit, "Complete DataAccess for load: %d, store: %d, PC %s, [sn:%lli]\n", + inst->isLoad(), inst->isStore(), inst->pcState(), inst->seqNum); if (!TheISA::HasUnalignedMemAcc || !state->isSplit || !state->isLoad) { writeback(inst, pkt); @@ -115,6 +117,8 @@ } if (inst->isStore()) { + DPRINTF(LSQUnit, "Complete DataAccess for store PC %s, [sn:%lli]\n", + inst->pcState(), inst->seqNum); completeStore(state->idx); } } @@ -132,7 +136,8 @@ LSQUnit::LSQUnit() : loads(0), stores(0), storesToWB(0), cacheBlockMask(0), stalled(false), isStoreBlocked(false), isLoadBlocked(false), - loadBlockedHandled(false), storeInFlight(false), hasPendingPkt(false) + loadBlockedHandled(false), storeInFlight(false), hasPendingPkt(false), + pendingSplitPkt(NULL), sendRetryEvent(this) { } @@ -169,9 +174,6 @@ storeHead = storeWBIdx = storeTail = 0; - usedPorts = 0; - cachePorts = params->cachePorts; - retryPkt = NULL; memDepViolator = NULL; @@ -233,6 +235,14 @@ lsqCacheBlocked .name(name() + ".cacheBlocked") .desc("Number of times an access to memory failed due to the cache being blocked"); + + cacheReadReqPortBlocked + .name(name() + ".cacheReadReqPortBlocked") + .desc("Number of times an access to cache failed due to the lack of free cache read ports for load requests"); + + cacheWriteReqPortBlocked + .name(name() + ".cacheWriteReqPortBlocked") + .desc("Number of times an access to cache failed due to the lack of free cache write ports for store requests"); } template @@ -244,6 +254,13 @@ template void +LSQUnit::sendCacheRetry() +{ + dcachePort->sendRetry(); +} + +template +void LSQUnit::clearLQ() { loadQueue.clear(); @@ -280,7 +297,11 @@ storeHead = storeWBIdx = storeTail = 0; - usedPorts = 0; + //Reset Counters + lsq->usedReadPorts = 0; + lsq->usedWritePorts = 0; + 
lsq->usedReadRespPorts = 0; + lsq->usedWriteRespPorts = 0; memDepViolator = NULL; @@ -505,7 +526,8 @@ */ while (load_idx != loadTail) { DynInstPtr ld_inst = loadQueue[load_idx]; - if (!ld_inst->effAddrValid() || ld_inst->uncacheable()) { + if (!ld_inst->effAddrValid() || ld_inst->uncacheable() + || ld_inst->cacheAccessDelayed()) { incrLdIdx(load_idx); continue; } @@ -589,6 +611,13 @@ load_fault == NoFault) return load_fault; + if (inst->cacheAccessDelayed() && + load_fault == NoFault) { + DPRINTF(LSQUnit, "load PC %s, [sn:%lli] was delayed due to lack of free cache ports\n", + inst->pcState(), inst->seqNum); + return load_fault; + } + // If the instruction faulted or predicated false, then we need to send it // along to commit without the instruction completing. if (load_fault != NoFault || inst->readPredicate() == false) { @@ -731,8 +760,37 @@ template void +LSQUnit::readPendingSplitLoad() +{ + assert(lsq->usedReadPorts == 0); + assert(lsq->hasPendingSplitPkt); + assert(pendingSplitPkt != NULL); + ++lsq->usedReadPorts; + + if (!dcachePort->sendTimingReq(pendingSplitPkt)) { + DPRINTF(LSQUnit, "sendTiming pkt for second addr (split):%#x failed!\n", pendingSplitPkt->getAddr()); + // The main packet will be deleted in completeDataAccess. + delete pendingSplitPkt->req; + delete pendingSplitPkt; + + pendingSplitState->complete(); + + //retryId should be -1 here. + lsq->setRetryTid(lsqID); + } + + pendingSplitPkt = NULL; + pendingSplitState = NULL; +} + +template +void LSQUnit::writebackPendingStore() { + //since this is the first packet we send this cycle, all + //cache write ports should be available. 
+ assert(lsq->usedWritePorts == 0); + if (hasPendingPkt) { assert(pendingPkt != NULL); @@ -759,8 +817,17 @@ storeWBIdx != storeTail && storeQueue[storeWBIdx].inst && storeQueue[storeWBIdx].canWB && - ((!needsTSO) || (!storeInFlight)) && - usedPorts < cachePorts) { + ((!needsTSO) || (!storeInFlight)) + ) { + + //Check to see if there is enough port to send this instr + if (lsq->usedWritePorts >= lsq->cacheWritePorts) { + DPRINTF(LSQUnit, "Unable to write back any more stores, " + "no more free cache write ports!\n"); + ++cacheWriteReqPortBlocked; + break; + + } if (isStoreBlocked || lsq->cacheBlocked()) { DPRINTF(LSQUnit, "Unable to write back any more stores, cache" @@ -778,8 +845,6 @@ continue; } - ++usedPorts; - if (storeQueue[storeWBIdx].inst->isDataPrefetch()) { incrStIdx(storeWBIdx); @@ -912,14 +977,12 @@ state->pendingPacket = snd_data_pkt; } } else { - // If split, try to send the second packet too if (split) { assert(snd_data_pkt); // Ensure there are enough ports to use. - if (usedPorts < cachePorts) { - ++usedPorts; + if (lsq->usedWritePorts < lsq->cacheWritePorts) { if (sendStore(snd_data_pkt)) { storePostSend(snd_data_pkt); } else { @@ -928,7 +991,7 @@ inst->seqNum); } } else { - + ++cacheWriteReqPortBlocked; // Store the packet for when there's free ports. assert(pendingPkt == NULL); pendingPkt = snd_data_pkt; @@ -942,9 +1005,6 @@ } } - // Not sure this should set it to 0. - usedPorts = 0; - assert(stores >= 0 && storesToWB >= 0); } @@ -1100,6 +1160,8 @@ // Squashed instructions do not need to complete their access. 
if (inst->isSquashed()) { + DPRINTF(LSQUnit,"Writeback: squashed instruction PC %s, " + "[sn:%lli]\n", inst->pcState(), inst->seqNum); iewStage->decrWb(inst->seqNum); assert(!inst->isStore()); ++lsqIgnoredResponses; @@ -1107,6 +1169,8 @@ } if (!inst->isExecuted()) { + DPRINTF(LSQUnit,"Writeback: non-executed instruction PC %s, " + "[sn:%lli]\n", inst->pcState(), inst->seqNum); inst->setExecuted(); // Complete access to copy data to proper place. @@ -1178,6 +1242,9 @@ bool LSQUnit::sendStore(PacketPtr data_pkt) { + //one write port is going to be used + lsq->usedWritePorts++; + if (!dcachePort->sendTimingReq(data_pkt)) { // Need to handle becoming blocked on a store. isStoreBlocked = true; @@ -1201,6 +1268,10 @@ LSQSenderState *state = dynamic_cast(retryPkt->senderState); + //There should be free cache write ports + assert(lsq->usedWritePorts < lsq->cacheWritePorts); + lsq->usedWritePorts++; + if (dcachePort->sendTimingReq(retryPkt)) { // Don't finish the store unless this is the last packet. if (!TheISA::HasUnalignedMemAcc || !state->pktToSend || @@ -1215,8 +1286,17 @@ // Send any outstanding packet. if (TheISA::HasUnalignedMemAcc && state->pktToSend) { assert(state->pendingPacket); - if (sendStore(state->pendingPacket)) { - storePostSend(state->pendingPacket); + //check for free ports + if (lsq->usedWritePorts < lsq->cacheWritePorts) { + if (sendStore(state->pendingPacket)) { + storePostSend(state->pendingPacket); + } + } else { //No free cache write port + ++cacheWriteReqPortBlocked; + // Store the packet for when there's free ports. 
+ assert(pendingPkt == NULL); + pendingPkt = state->pendingPacket; + hasPendingPkt = true; } } } else { diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh --- a/src/cpu/o3/lsq_impl.hh +++ b/src/cpu/o3/lsq_impl.hh @@ -55,6 +55,8 @@ template LSQ::LSQ(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params) : cpu(cpu_ptr), iewStage(iew_ptr), + cacheReadPorts(params->cacheReadPorts), + cacheWritePorts(params->cacheWritePorts), LQEntries(params->LQEntries), SQEntries(params->SQEntries), numThreads(params->numThreads), @@ -113,6 +115,15 @@ maxLQEntries, maxSQEntries, tid); thread[tid].setDcachePort(&cpu_ptr->getDataPort()); } + + //Reset Counters + usedReadPorts = 0; + usedWritePorts = 0; + nextRecvCycle = 0; + usedReadRespPorts = 0; + usedWriteRespPorts = 0; + hasPendingSplitPkt = false; + pendingSplitThreadId = -1; } @@ -127,6 +138,14 @@ void LSQ::regStats() { + cacheReadRespPortBlocked + .name(name() + ".cacheReadRespPortBlocked") + .desc("Number of times a read resp from cache was rejected due to the lack of free cache read ports"); + + cacheWriteRespPortBlocked + .name(name() + ".cacheWriteRespPortBlocked") + .desc("Number of times a write resp from cache was rejected due to the lack of free cache write ports"); + //Initialize LSQs for (ThreadID tid = 0; tid < numThreads; tid++) { thread[tid].regStats(); @@ -218,6 +237,11 @@ void LSQ::tick() { + //Reseting counter + DPRINTF(LSQ, "LSQ tick\n"); + usedReadPorts = 0; + usedWritePorts = 0; + list::iterator threads = activeThreads->begin(); list::iterator end = activeThreads->end(); @@ -266,6 +290,18 @@ template void +LSQ::readPendingSplitLoad() +{ + if (hasPendingSplitPkt) { + assert(pendingSplitThreadId >= 0); + thread[pendingSplitThreadId].readPendingSplitLoad(); + hasPendingSplitPkt = false; + pendingSplitThreadId = -1; + } +} + +template +void LSQ::writebackStores() { list::iterator threads = activeThreads->begin(); @@ -319,11 +355,80 @@ template bool +LSQ::isRespBlocked(PacketPtr pkt) +{ + /* recvTiming() 
might be called before cpu's tick() is called + * (although in the same tick). So when cache calls recvTiming() before + * cpu's tick() is called, usedReadRespPorts and usedWriteRespPorts have + * not yet been initialized for this cycle. So they might have improper + * values. That's why we here check to see if a new cycle has started or + * not and if so, we initialize them. It is messy and slow, but I could + * not think of any better way. + * usedReadRespPorts and usedWriteRespPorts should be always read after + * executing the few lines below, otherwise they might have wrong values. + * */ + if (curTick() >= nextRecvCycle) { + nextRecvCycle = cpu->clockEdge(Cycles(1)); + usedReadRespPorts = 0; + usedWriteRespPorts = 0; + } + + // if pkt is not response and not invalidate, this cpu + // is not the intended dest. so if not an invalidate snoop from + // dcache or not a read response, we just ignore it. + if (pkt->isResponse() || pkt->isInvalidate()) { + if (pkt->isRead()) { + if (usedReadRespPorts < cacheReadPorts) { + ++usedReadRespPorts; + } else { //no free read resp ports left + DPRINTF(LSQ, "read resp pkt for addr:%#x was rejected\n", pkt->getAddr()); + ++cacheReadRespPortBlocked; + //Now the recv gets blocked in this cycle, and we have + //to sendRetry next cycle to unblock it. This also prevents + //O3 from receiving write responses. but I don't have any + //control on blocking read and write responses separately. + cpu->schedule(thread[pkt->req->threadId()].sendRetryEvent, cpu->clockEdge(Cycles(1))); + return true; + } + } + + if (pkt->isWrite()) { + if (usedWriteRespPorts < cacheWritePorts) { + ++usedWriteRespPorts; + } else { //no free write resp ports left + DPRINTF(LSQ, "write resp pkt for addr:%#x was rejected\n", pkt->getAddr()); + ++cacheWriteRespPortBlocked; + //Now the recv gets blocked in this cycle, and we have + //to sendRetry next cycle to unblock it. This also prevents + //O3 from receiving read responses. 
but I don't have any + //control on blocking read and write responses separately. + cpu->schedule(thread[pkt->req->threadId()].sendRetryEvent, cpu->clockEdge(Cycles(1))); + return true; + } + } + } + + //not blocked. + return false; +} + +template +bool LSQ::recvTimingResp(PacketPtr pkt) { + assert(pkt->isResponse()); + + DPRINTF(LSQ, "received pkt for addr:%#x %s, isLLSC: %d, isRead: %d, isWrite: %d, isReadWrite: %d\n", + pkt->getAddr(), pkt->cmdString(), pkt->isLLSC(), pkt->isRead(), pkt->isWrite(), pkt->isReadWrite()); + if (pkt->isError()) DPRINTF(LSQ, "Got error packet back for address: %#X\n", pkt->getAddr()); + + //check for free ports etc + if (isRespBlocked(pkt)) + return false; + thread[pkt->req->threadId()].completeDataAccess(pkt); return true; } @@ -332,8 +437,13 @@ void LSQ::recvTimingSnoopReq(PacketPtr pkt) { - DPRINTF(LSQ, "received pkt for addr:%#x %s\n", pkt->getAddr(), - pkt->cmdString()); + assert(pkt->isRequest()); + DPRINTF(LSQ, "received snoop pkt for addr:%#x %s, isLLSC: %d, isRead: %d, isWrite: %d, isReadWrite: %d\n", + pkt->getAddr(), pkt->cmdString(), pkt->isLLSC(), pkt->isRead(), pkt->isWrite(), pkt->isReadWrite()); + + //check for free ports + //if (isRespBlocked(pkt)) + // return false; // must be a snoop if (pkt->isInvalidate()) { diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -49,6 +49,7 @@ #include "mem/packet.hh" #include "mem/port.hh" #include "sim/fault_fwd.hh" +#include "sim/eventq.hh" struct DerivO3CPUParams; @@ -91,6 +92,9 @@ /** Sets the pointer to the dcache port. */ void setDcachePort(MasterPort *dcache_port); + /**Calls sendRetry on dcachePort. */ + void sendCacheRetry(); + /** Switches out LSQ unit. */ void switchOut(); @@ -105,7 +109,7 @@ * @todo: Move the number of used ports up to the LSQ level so it can * be shared by all LSQ units. */ - void tick() { usedPorts = 0; } + void tick() { } /** Inserts an instruction. 
*/ void insert(DynInstPtr &inst); @@ -416,13 +420,6 @@ /** The index of the tail instruction in the SQ. */ int storeTail; - /// @todo Consider moving to a more advanced model with write vs read ports - /** The number of cache ports available each cycle. */ - int cachePorts; - - /** The number of used cache ports in this cycle. */ - int usedPorts; - /** Is the LSQ switched out. */ bool switchedOut; @@ -471,6 +468,13 @@ /** The packet that is pending free cache ports. */ PacketPtr pendingPkt; + /** The split load packet that pending free cache read request ports. */ + PacketPtr pendingSplitPkt; + + /** The load state of pending split packet that pending free cache read + * request ports. */ + LSQSenderState *pendingSplitState; + /** Flag for memory model. */ bool needsTSO; @@ -508,6 +512,12 @@ /** Number of times the LSQ is blocked due to the cache. */ Stats::Scalar lsqCacheBlocked; + /** Number of times the load request execution in the LSQ is denied due to lack of free read cache ports. */ + Stats::Scalar cacheReadReqPortBlocked; + + /** Number of times the store request execution in the LSQ is denied due to lack of free write cache ports. */ + Stats::Scalar cacheWriteReqPortBlocked; + public: /** Executes the load at the given index. */ Fault read(Request *req, Request *sreqLow, Request *sreqHigh, @@ -545,6 +555,14 @@ /** Returns whether or not the LSQ unit is stalled. */ bool isStalled() { return stalled; } + + //EventWrapper for sendRetryEvent + EventWrapper sendRetryEvent; + + /** Does a read for a split load that couldn't be completed + * the previous cycle. 
*/ + void readPendingSplitLoad(); + }; template @@ -631,6 +649,9 @@ delete fst_data_pkt; delete snd_data_pkt; } + DPRINTF(LSQUnit, "Scheduling writeback event for load [sn:%lli] " + "PC %s due to MmappedIpr\n", + load_inst->seqNum, load_inst->pcState()); WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, this); cpu->schedule(wb, cpu->clockEdge(delay)); return NoFault; @@ -692,7 +713,14 @@ PacketPtr data_pkt = new Packet(req, MemCmd::ReadReq); data_pkt->dataStatic(load_inst->memData); + + DPRINTF(LSQUnit, "Scheduling writeback event for load [sn:%lli] " + "PC %s due to store to load forwarding\n", + load_inst->seqNum, load_inst->pcState()); + WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, this); + //To disable cacheAccessDelayed + load_inst->cacheAccessDelayed(false); // We'll say this has a 1 cycle load-store forwarding latency // for now. @@ -736,6 +764,8 @@ iewStage->rescheduleMemInst(load_inst); iewStage->decrWb(load_inst->seqNum); load_inst->clearIssued(); + //Disable cache access delayed + load_inst->cacheAccessDelayed(false); ++lsqRescheduledLoads; // Do not generate a writeback event as this instruction is not @@ -758,14 +788,30 @@ } // If there's no forwarding case, then go access memory + + /*Checking for free read ports before accessing memory */ + if (lsq->usedReadPorts >= lsq->cacheReadPorts) { + ++cacheReadReqPortBlocked; + DPRINTF(LSQUnit, "no free cache read ports to send load requests.\n"); + //NOTE: this instr does not lookup TLB any more, because + //translationStarted and translationCompleted are already set. 
+ + load_inst->cacheAccessDelayed(true); + load_inst->savedReq = req; + load_inst->savedSreqLow = sreqLow; + load_inst->savedSreqHigh = sreqHigh; + return NoFault; + } + + load_inst->cacheAccessDelayed(false); + /*End checking */ + DPRINTF(LSQUnit, "Doing memory access for inst [sn:%lli] PC %s\n", load_inst->seqNum, load_inst->pcState()); assert(!load_inst->memData); load_inst->memData = new uint8_t[64]; - ++usedPorts; - // if we the cache is not blocked, do cache access bool completedFirst = false; if (!lsq->cacheBlocked()) { @@ -804,7 +850,11 @@ state->mainPkt = data_pkt; } + DPRINTF(LSQUnit, "sendTiming pkt for addr:%#x\n", fst_data_pkt->getAddr()); + ++lsq->usedReadPorts; + if (!dcachePort->sendTimingReq(fst_data_pkt)) { + DPRINTF(LSQUnit, "sendTiming pkt for addr:%#x failed!\n", fst_data_pkt->getAddr()); // Delete state and data packet because a load retry // initiates a pipeline restart; it does not retry. delete state; @@ -832,19 +882,38 @@ // load will be squashed, so indicate this to the state object. // The first packet will return in completeDataAccess and be // handled there. - ++usedPorts; - if (!dcachePort->sendTimingReq(snd_data_pkt)) { - // The main packet will be deleted in completeDataAccess. - delete snd_data_pkt->req; - delete snd_data_pkt; + DPRINTF(LSQUnit, "sendTiming pkt for second addr (split):%#x\n", snd_data_pkt->getAddr()); + if (lsq->usedReadPorts < lsq->cacheReadPorts) { + ++lsq->usedReadPorts; + if (!dcachePort->sendTimingReq(snd_data_pkt)) { - state->complete(); + DPRINTF(LSQUnit, "sendTiming pkt for second addr (split):%#x failed!\n", snd_data_pkt->getAddr()); + // The main packet will be deleted in completeDataAccess. 
+ delete snd_data_pkt->req; + delete snd_data_pkt; - req = NULL; - sreqHigh = NULL; + state->complete(); - lsq->setRetryTid(lsqID); + req = NULL; + sreqHigh = NULL; + + lsq->setRetryTid(lsqID); + } + } else { + //no free cache read ports + ++cacheReadReqPortBlocked; + + //We save the packet to be sent next cycle + //There should be only one pending split packet for all + //threads, since once cache is blocked for one thread, all + //other threads should not be able to do more loads. + assert(!lsq->cacheBlocked()); + assert(pendingSplitPkt == NULL); + pendingSplitPkt = snd_data_pkt; + pendingSplitState = state; + lsq->hasPendingSplitPkt = true; + lsq->pendingSplitThreadId = load_inst->threadNumber; } } } diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh --- a/src/cpu/base_dyn_inst.hh +++ b/src/cpu/base_dyn_inst.hh @@ -150,6 +150,7 @@ IsUncacheable, ReqMade, MemOpDone, + CacheAccessDelayed, MaxFlags }; @@ -238,8 +239,9 @@ /////////////////////// TLB Miss ////////////////////// /** - * Saved memory requests (needed when the DTB address translation is - * delayed due to a hw page table walk). + * Saved memory requests (needed when (1) the DTB address translation is + * delayed due to a hw page table walk (2) load cannot be executed due + * to lack of cache ports). */ RequestPtr savedReq; RequestPtr savedSreqLow; @@ -357,6 +359,13 @@ return (translationStarted() && !translationCompleted()); } + /** True if the instruction was delayed because of lack of cache + * ports while sitting in the LSQ. It this is true, the instruction is + * tried to execute again. 
+ */ + bool cacheAccessDelayed() const { return instFlags[CacheAccessDelayed]; } + void cacheAccessDelayed(bool f) { instFlags[CacheAccessDelayed] = f; } + public: #ifdef DEBUG void dumpSNList(); diff --git a/src/cpu/o3/O3CPU.py b/src/cpu/o3/O3CPU.py --- a/src/cpu/o3/O3CPU.py +++ b/src/cpu/o3/O3CPU.py @@ -37,7 +37,8 @@ type = 'DerivO3CPU' activity = Param.Unsigned(0, "Initial count") - cachePorts = Param.Unsigned(200, "Cache Ports") + cacheReadPorts = Param.Unsigned(1, "Cache Read Ports") + cacheWritePorts = Param.Unsigned(1, "Cache Write Ports") decodeToFetchDelay = Param.Cycles(1, "Decode to fetch delay") renameToFetchDelay = Param.Cycles(1 ,"Rename to fetch delay") diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh --- a/src/cpu/o3/iew_impl.hh +++ b/src/cpu/o3/iew_impl.hh @@ -1286,6 +1286,15 @@ continue; } + //to make sure we don't load more than the available cache read ports + if (inst->cacheAccessDelayed() && fault == NoFault) { + // Not enough cache read port now, we should wait + DPRINTF(IEW, "Execute: Delayed cache Access, deferring " + "load.\n"); + instQueue.deferMemAccessInst(inst); + continue; + } + if (inst->isDataPrefetch() || inst->isInstPrefetch()) { inst->fault = NoFault; } @@ -1516,6 +1525,9 @@ void DefaultIEW::tick() { + //To reset some lsq counter + ldstQueue.tick(); + wbNumInst = 0; wbCycle = 0; @@ -1541,6 +1553,9 @@ } if (exeStatus != Squashing) { + //load remaining split loads + ldstQueue.readPendingSplitLoad(); + executeInsts(); writebackInsts(); diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh --- a/src/cpu/o3/inst_queue.hh +++ b/src/cpu/o3/inst_queue.hh @@ -195,6 +195,11 @@ */ DynInstPtr getDeferredMemInstToExecute(); + /** Returns a memory instruction that was deferred due to the lack of free + * ports. + */ + DynInstPtr getDeferredMemAccessInstToExecute(); + /** * Records the instruction as the producer of a register without * adding it to the rest of the IQ. 
@@ -244,6 +249,11 @@ */ void deferMemInst(DynInstPtr &deferred_inst); + /** + * Defers a memory instruction when there is lack of free cache ports. + */ + void deferMemAccessInst(DynInstPtr &deferred_inst); + /** Indicates an ordering violation between a store and a load. */ void violation(DynInstPtr &store, DynInstPtr &faulting_load); @@ -310,6 +320,11 @@ */ std::list deferredMemInsts; + /** List of instructions waiting for free cache ports. Their access to + * cache was denied at least one time for lack of free cache ports. + */ + std::list deferredMemAccessInsts; + /** * Struct for comparing entries to be added to the priority queue. * This gives reverse ordering to the instructions in terms of diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh --- a/src/cpu/o3/inst_queue_impl.hh +++ b/src/cpu/o3/inst_queue_impl.hh @@ -420,6 +420,7 @@ nonSpecInsts.clear(); listOrder.clear(); deferredMemInsts.clear(); + deferredMemAccessInsts.clear(); } template @@ -779,6 +780,16 @@ total_deferred_mem_issued++; } + //for adding mem instructions that were denied to access cache because of lack of free ports + //FIXME: should it before getDeferredMemInstToExecute()? + while (total_deferred_mem_issued < totalWidth && + (deferred_mem_inst = getDeferredMemAccessInstToExecute()) != 0) { + issueToExecuteQueue->access(0)->size++; + instsToExecute.push_back(deferred_mem_inst); + total_deferred_mem_issued++; + } + + // Have iterator to head of the list // While I haven't exceeded bandwidth or reached the end of the list, // Try to get a FU that can do what this op needs. @@ -931,7 +942,8 @@ // @todo If the way deferred memory instructions are handeled due to // translation changes then the deferredMemInsts condition should be removed // from the code below. 
- if (total_issued || total_deferred_mem_issued || deferredMemInsts.size()) { + if (total_issued || total_deferred_mem_issued || deferredMemInsts.size() + || deferredMemAccessInsts.size()) { cpu->activityThisCycle(); } else { DPRINTF(IQ, "Not able to schedule any instructions.\n"); @@ -1153,6 +1165,26 @@ template void +InstructionQueue::deferMemAccessInst(DynInstPtr &deferred_inst) +{ + deferredMemAccessInsts.push_back(deferred_inst); +} + +template +typename Impl::DynInstPtr +InstructionQueue::getDeferredMemAccessInstToExecute() +{ + for (ListIt it = deferredMemAccessInsts.begin(); it != deferredMemAccessInsts.end(); + ++it) { + DynInstPtr ret = *it; + deferredMemAccessInsts.erase(it); + return ret; + } + return NULL; +} + +template +void InstructionQueue::violation(DynInstPtr &store, DynInstPtr &faulting_load) { diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -130,6 +130,11 @@ void writebackStores(ThreadID tid); /** + * Attempts to read a pending split load. + */ + void readPendingSplitLoad(); + + /** * Squash instructions from a thread until the specified sequence number. */ void squash(const InstSeqNum &squashed_num, ThreadID tid) @@ -307,6 +312,38 @@ /** The IEW stage pointer. */ IEW *iewStage; + /** The number of used cache read ports in this cycle. */ + int usedReadPorts; + + /** The number of used cache write ports in this cycle. */ + int usedWritePorts; + + /** The number of used cache read response ports in this cycle. */ + int usedReadRespPorts; + + /** The number of used cache write response ports in this cycle. */ + int usedWriteRespPorts; + + /** The number of cache read ports available each cycle. */ + int cacheReadPorts; + + /** The number of cache write ports available each cycle. */ + int cacheWritePorts; + + /** Next cycle to receive a resp from cache. */ + Tick nextRecvCycle; + + /** checks to see if there are any free cache read and write + * response ports left this cycle. 
*/ + bool isRespBlocked(PacketPtr pkt); + + /** Whether or not there is a split load packet (second packet) that + * could not be sent because of a lack of cache read request ports. */ + bool hasPendingSplitPkt; + + /** The thread Id of the split load that pending free cache read request ports. */ + ThreadID pendingSplitThreadId; + protected: /** The LSQ policy for SMT mode. */ LSQPolicy lsqPolicy; @@ -334,6 +371,15 @@ /** The thread id of the LSQ Unit that is currently waiting for a * retry. */ ThreadID retryTid; + + private: + /** Number of times a read resp from cache was rejected due to + * the lack of free cache read ports */ + Stats::Scalar cacheReadRespPortBlocked; + + /** Number of times a write resp from cache was rejected due to + * the lack of free cache write ports */ + Stats::Scalar cacheWriteRespPortBlocked; }; template