# Node ID 81cb93c4a16caf4b421a1acf8ebb913ba6df9c96
# Parent  31c5786945b447b372c3b7d346aea8fa6208577c
diff --git a/src/mem/DRAMCtrl.py b/src/mem/DRAMCtrl.py
--- a/src/mem/DRAMCtrl.py
+++ b/src/mem/DRAMCtrl.py
@@ -76,6 +76,14 @@
     # bus in front of the controller for multiple ports
     port = SlavePort("Slave port")
 
+    # the unified queue holds all incoming memory requests to help with
+    # flow control and is searched by the memory controller when its
+    # read or write queues have free space available
+    use_unified_buffer = Param.Bool(False, "Should we use a unified buffer?")
+    unified_buffer_size = Param.Unsigned(100, "Number of total requests")
+    unified_search_depth = Param.Unsigned(16, "Maximum number of entries "
+        "to search when looking for new requests")
+
     # the basic configuration of the controller architecture, note
     # that each entry corresponds to a burst for the specific DRAM
     # configuration (e.g. x32 with burst length 8 is 32 bytes) and not
diff --git a/src/mem/dram_ctrl.hh b/src/mem/dram_ctrl.hh
--- a/src/mem/dram_ctrl.hh
+++ b/src/mem/dram_ctrl.hh
@@ -484,6 +484,12 @@
     EventWrapper<DRAMCtrl, &DRAMCtrl::processRespondEvent> respondEvent;
 
     /**
+     * Check the unified queue for any requests that may be able to be
+     * copied into the separated read/write queues.
+     */
+    void processUnifiedQueue();
+
+    /**
      * Check if the read queue has room for more entries
      *
      * @param pktCount The number of entries needed in the read queue
@@ -643,7 +649,20 @@
      *
      * @return An address aligned to a DRAM burst
      */
-    Addr burstAlign(Addr addr) const { return (addr & ~(Addr(burstSize - 1))); }
+    Addr burstAlign(Addr addr) const
+    { return (addr & ~(Addr(burstSize - 1))); }
+
+    /**
+     * Unified queue to hold all outstanding memory requests. Issued requests
+     * are moved to a pending set to simplify searching of the queue.
+     * Requests are marked done once DRAM access is complete. After sending
+     * a response on the port, the request is finally removed from the queue.
+     */
+    std::list<PacketPtr> unifiedQueue;
+    std::unordered_set<Addr> inUnifiedQueue;
+    std::unordered_set<Addr> unifiedPending;
+
+    int unifiedQueueOccupancy() const;
 
     /**
      * The controller's main read and write queues
@@ -696,6 +715,9 @@
     const uint32_t banksPerRank;
     const uint32_t channels;
     uint32_t rowsPerBank;
+    const bool useUnifiedBuffer;
+    const uint32_t unifiedBufferSize;
+    const uint32_t unifiedSearchDepth;
     const uint32_t readBufferSize;
     const uint32_t writeBufferSize;
     const uint32_t writeHighThreshold;
@@ -817,6 +839,7 @@
     Stats::Formula busUtilWrite;
 
     // Average queue lengths
+    Stats::Average avgUniQLen;
     Stats::Average avgRdQLen;
     Stats::Average avgWrQLen;
 
@@ -886,6 +909,7 @@
     Tick recvAtomic(PacketPtr pkt);
     void recvFunctional(PacketPtr pkt);
     bool recvTimingReq(PacketPtr pkt);
+    bool issueTimingReq(PacketPtr pkt);
 
 };
 
diff --git a/src/mem/dram_ctrl.cc b/src/mem/dram_ctrl.cc
--- a/src/mem/dram_ctrl.cc
+++ b/src/mem/dram_ctrl.cc
@@ -73,6 +73,9 @@
     bankGroupsPerRank(p->bank_groups_per_rank),
     bankGroupArch(p->bank_groups_per_rank > 0),
     banksPerRank(p->banks_per_rank), channels(p->channels), rowsPerBank(0),
+    useUnifiedBuffer(p->use_unified_buffer),
+    unifiedBufferSize(p->unified_buffer_size),
+    unifiedSearchDepth(p->unified_search_depth),
     readBufferSize(p->read_buffer_size),
     writeBufferSize(p->write_buffer_size),
     writeHighThreshold(writeBufferSize * p->write_high_thresh_perc / 100.0),
@@ -288,6 +291,71 @@
     return latency;
 }
 
+void
+DRAMCtrl::processUnifiedQueue()
+{
+    // Fast path if there is nothing to do
+    if (unifiedQueue.empty()) {
+        return;
+    }
+
+    // See how many reads and writes we can issue; use signed arithmetic
+    // so a temporarily over-full queue yields a negative space value
+    int read_queue_space = static_cast<int>(readBufferSize)
+        - static_cast<int>(readQueue.size() + respQueue.size());
+    int write_queue_space = static_cast<int>(writeBufferSize)
+        - static_cast<int>(writeQueue.size());
+    uint32_t search_count = 0;
+
+    auto iter = unifiedQueue.begin();
+    while (iter != unifiedQueue.end()) {
+        // Emulate a maximum number of requests we can search per cycle
+        if (++search_count > unifiedSearchDepth) {
+            break;
+        }
+
+        PacketPtr pkt = (*iter);
+        unsigned size = pkt->getSize();
+        unsigned offset = pkt->getAddr() & (burstSize - 1);
+        int dram_pkt_count = divCeil(offset + size, burstSize);
+
+        // Move any requests that fit, up to the maximum search depth. Note
+        // that write merging and read servicing by the write queue are not
+        // handled here and left to the split read and write queues.
+        if (pkt->isRead() && read_queue_space >= dram_pkt_count) {
+            DPRINTF(DRAM, "Read addr %lld moved from unified queue\n",
+                    pkt->getAddr());
+            inUnifiedQueue.erase(burstAlign(pkt->getAddr()));
+            unifiedPending.insert(burstAlign(pkt->getAddr()));
+            iter = unifiedQueue.erase(iter);
+            read_queue_space -= dram_pkt_count;
+            issueTimingReq(pkt);
+        } else if (pkt->isWrite() && write_queue_space >= dram_pkt_count) {
+            DPRINTF(DRAM, "Write addr %lld moved from unified queue\n",
+                    pkt->getAddr());
+            inUnifiedQueue.erase(burstAlign(pkt->getAddr()));
+            unifiedPending.insert(burstAlign(pkt->getAddr()));
+            iter = unifiedQueue.erase(iter);
+            write_queue_space -= dram_pkt_count;
+            issueTimingReq(pkt);
+        } else {
+            // cannot be issued this cycle; keep it and examine the next
+            ++iter;
+        }
+
+        // make sure we don't enqueue too many requests or invalid types
+        DPRINTF(DRAM, "Unified queue occupancy is %d of %d\n",
+                unifiedQueueOccupancy(), unifiedBufferSize);
+        assert(unifiedQueueOccupancy() <= static_cast<int>(unifiedBufferSize));
+        assert(pkt->isRead() || pkt->isWrite());
+    }
+}
+
+int
+DRAMCtrl::unifiedQueueOccupancy() const
+{
+    return unifiedQueue.size() + unifiedPending.size();
+}
+
 bool
 DRAMCtrl::readQueueFull(unsigned int neededEntries) const
 {
@@ -605,7 +673,51 @@
     }
     prevArrival = curTick();
 
+    // If we are using a unified buffer, place requests in it and
+    // check if space is available in the separate read/write buffers.
+    // Otherwise, send directly to the read/write buffers and use retries.
+    bool rv = true;
+    if (useUnifiedBuffer && (pkt->isRead() || pkt->isWrite())) {
+        // See if we can enqueue directly to internal queues. Otherwise,
+        // place in unified queue to be searched later.
+        unsigned size = pkt->getSize();
+        unsigned offset = pkt->getAddr() & (burstSize - 1);
+        unsigned int dram_pkt_count = divCeil(offset + size, burstSize);
+
+        if ((pkt->isRead() && !readQueueFull(dram_pkt_count)) ||
+            (pkt->isWrite() && !writeQueueFull(dram_pkt_count))) {
+            DPRINTF(DRAM, "Request addr %lld skipping unified queue\n",
+                    pkt->getAddr());
+
+            // separate issued requests to simplify unified queue searching
+            unifiedPending.insert(burstAlign(pkt->getAddr()));
+
+            bool M5_VAR_USED issue_success = issueTimingReq(pkt);
+            assert(issue_success);
+        } else {
+            DPRINTF(DRAM, "Request addr %lld placed in unified queue\n",
+                    pkt->getAddr());
+
+            // place in queue to be searched once a request completes
+            unifiedQueue.push_back(pkt);
+            inUnifiedQueue.insert(burstAlign(pkt->getAddr()));
+        }
+
+        avgUniQLen = unifiedQueueOccupancy();
+
+        // this should not happen if flow control is managed by the port
+        assert(unifiedQueueOccupancy() <= static_cast<int>(unifiedBufferSize));
+    } else {
+        rv = issueTimingReq(pkt);
+    }
+
+    return rv;
+}
+
+bool
+DRAMCtrl::issueTimingReq(PacketPtr pkt)
+{
     // Find out how many dram packets a pkt translates to
     // If the burst size is equal or larger than the pkt size, then a pkt
     // translates to only one dram packet. Otherwise, a pkt translates to
     // multiple dram packets
@@ -683,13 +795,20 @@
     } else {
         // if there is nothing left in any queue, signal a drain
         if (drainState() == DrainState::Draining &&
-            writeQueue.empty() && readQueue.empty()) {
+            writeQueue.empty() && readQueue.empty() &&
+            unifiedQueueOccupancy() == 0) {
             DPRINTF(Drain, "DRAM controller done draining\n");
             signalDrainDone();
         }
     }
 
+    // If we have space available in the read queue, see if we can
+    // insert requests from the unified queue, if used.
+    if (useUnifiedBuffer) {
+        processUnifiedQueue();
+    }
+
     // We have made a location in the queue available at this point,
     // so if there is a read that was forced to wait, retry now
     if (retryRdReq) {
@@ -840,6 +959,14 @@
 {
     DPRINTF(DRAM, "Responding to Address %lld.. ",pkt->getAddr());
 
+    // if we are using a unified queue, this request can be removed now
+    if (useUnifiedBuffer && (pkt->isRead() || pkt->isWrite())) {
+        DPRINTF(DRAM, "Removing addr %lld from unified queue\n",
+                pkt->getAddr());
+        assert(unifiedPending.count(burstAlign(pkt->getAddr())));
+        unifiedPending.erase(burstAlign(pkt->getAddr()));
+    }
+
     bool needsResponse = pkt->needsResponse();
     // do the actual memory access which also turns the packet into a
     // response
@@ -1422,6 +1549,12 @@
     if (!nextReqEvent.scheduled())
         schedule(nextReqEvent, std::max(nextReqTime, curTick()));
 
+    // If writes are waiting in the unified queue and the write queue
+    // has space available, try to place them into the write queue now.
+    if (useUnifiedBuffer && writeQueue.size() < writeBufferSize) {
+        processUnifiedQueue();
+    }
+
     // If there is space available and we have writes waiting then let
     // them retry. This is done here to ensure that the retry does not
     // cause a nextReqEvent to be scheduled before we do so as part of
@@ -1931,6 +2064,11 @@
         .name(name() + ".perBankWrBursts")
         .desc("Per bank write bursts");
 
+    avgUniQLen
+        .name(name() + ".avgUniQLen")
+        .desc("Average number of requests in unified queue")
+        .precision(3);
+
     avgRdQLen
         .name(name() + ".avgRdQLen")
         .desc("Average read queue length when enqueuing")