diff -r 10647f5d0f7f -r 8492ca5ea301 src/mem/dram_ctrl.cc
--- a/src/mem/dram_ctrl.cc	Thu Feb 04 16:57:59 2016 -0600
+++ b/src/mem/dram_ctrl.cc	Mon Feb 08 13:55:00 2016 -0600
@@ -58,9 +58,10 @@
 DRAMCtrl::DRAMCtrl(const DRAMCtrlParams* p) :
     AbstractMemory(p),
     port(name() + ".port", *this), isTimingMode(false),
+    requestPortBlocked(false), responsePortBlocked(false),
     retryRdReq(false), retryWrReq(false),
     busState(READ),
-    nextReqEvent(this), respondEvent(this),
+    nextReqEvent(this), respondEvent(this), backendEvent(this),
     deviceSize(p->device_size),
     deviceBusWidth(p->device_bus_width), burstLength(p->burst_length),
     deviceRowBufferSize(p->device_rowbuffer_size),
@@ -75,6 +76,8 @@
     banksPerRank(p->banks_per_rank), channels(p->channels), rowsPerBank(0),
     readBufferSize(p->read_buffer_size),
     writeBufferSize(p->write_buffer_size),
+    totalBuffers(p->read_buffer_size + p->write_buffer_size +
+                 p->response_buffer_size),
     writeHighThreshold(writeBufferSize * p->write_high_thresh_perc / 100.0),
     writeLowThreshold(writeBufferSize * p->write_low_thresh_perc / 100.0),
     minWritesPerSwitch(p->min_writes_per_switch),
@@ -286,6 +289,17 @@
 }
 
 bool
+DRAMCtrl::buffersFull(unsigned int needed_entries) const
+{
+    // Every queue draws on the one shared pool of totalBuffers entries
+    unsigned int occupied = readQueue.size() + writeQueue.size();
+    occupied += respQueue.size() + backendQueue.size();
+    DPRINTF(DRAM, "Total buffer limit %d, current size %d, entries needed %d\n",
+            totalBuffers, occupied, needed_entries);
+    return totalBuffers < (occupied + needed_entries);
+}
+
+bool
 DRAMCtrl::readQueueFull(unsigned int neededEntries) const
 {
     DPRINTF(DRAM, "Read queue limit %d, current size %d, entries needed %d\n",
@@ -596,13 +610,6 @@
         return true;
     }
 
-    // Calc avg gap between requests
-    if (prevArrival != 0) {
-        totGap += curTick() - prevArrival;
-    }
-    prevArrival = curTick();
-
-
     // Find out how many dram packets a pkt translates to
     // If the burst size is equal or larger than the pkt size, then a pkt
     // translates to only one dram packet. Otherwise, a pkt translates to
@@ -611,6 +618,20 @@
     unsigned offset = pkt->getAddr() & (burstSize - 1);
     unsigned int dram_pkt_count = divCeil(offset + size, burstSize);
 
+    // Check if there is enough total buffering to accept this packet
+    if (buffersFull(dram_pkt_count)) {
+        DPRINTF(DRAM, "    Controller buffers full. Blocking requests...\n");
+        requestPortBlocked = true;
+        numBufsFullRetry++;
+        return false;
+    }
+
+    // Calc avg gap between requests
+    if (prevArrival != 0) {
+        totGap += curTick() - prevArrival;
+    }
+    prevArrival = curTick();
+
     // check local buffers and do not accept if full
     if (pkt->isRead()) {
         assert(size != 0);
@@ -648,6 +669,19 @@
 }
 
 void
+DRAMCtrl::recvRespRetry()
+{
+    // Retry after an earlier refused response: resume the backend queue
+    assert(responsePortBlocked);
+    responsePortBlocked = false;
+    assert(!backendEvent.scheduled());
+    // Head checked in place; a local would be unused without asserts
+    assert(backendQueue.top()->readyTime <= curTick());
+    // Backend queue is operated at the memory controller's frequency
+    schedule(backendEvent, nextCycle());
+}
+
+void
 DRAMCtrl::processRespondEvent()
 {
     DPRINTF(DRAM,
@@ -679,6 +713,7 @@
     if (!respQueue.empty()) {
         assert(respQueue.front()->readyTime >= curTick());
         assert(!respondEvent.scheduled());
+        assert(respQueue.front()->readyTime >= curTick());
         schedule(respondEvent, respQueue.front()->readyTime);
     } else {
         // if there is nothing left in any queue, signal a drain
@@ -838,7 +873,7 @@
 void
 DRAMCtrl::accessAndRespond(PacketPtr pkt, Tick static_latency)
 {
-    DPRINTF(DRAM, "Responding to Address %lld.. ",pkt->getAddr());
+    DPRINTF(DRAM, "Scheduling response for %lld...\n", pkt->getAddr());
 
     bool needsResponse = pkt->needsResponse();
     // do the actual memory access which also turns the packet into a
@@ -853,14 +888,20 @@
         // with headerDelay that takes into account the delay provided by
         // the xbar and also the payloadDelay that takes into account the
         // number of data beats.
-        Tick response_time = curTick() + static_latency + pkt->headerDelay +
-                             pkt->payloadDelay;
+        // NOTE: Unlike other queues, the backend queue and port are operated
+        // at the frequency of the memory controller
+        Tick response_time = clockEdge(ticksToCycles(static_latency +
+                                                     pkt->headerDelay +
+                                                     pkt->payloadDelay));
         // Here we reset the timing of the packet before sending it out.
         pkt->headerDelay = pkt->payloadDelay = 0;
 
-        // queue the packet in the response queue to be sent out after
+        // queue the packet in the back-end queue to be sent out after
         // the static latency has passed
-        port.schedTimingResp(pkt, response_time, true);
+        backendQueue.push(new BufferEntryBase(pkt, response_time));
+        if (!backendEvent.scheduled() && !responsePortBlocked) {
+            schedule(backendEvent, response_time);
+        }
     } else {
         // @todo the packet is going to be deleted, and the DRAMPacket
         // is still having a pointer to it
@@ -873,6 +914,44 @@
 }
 
 void
+DRAMCtrl::serviceBackendQueue()
+{
+    assert(!responsePortBlocked);
+
+    // Copy what the trace below needs before sending: once the send
+    // succeeds, this controller should no longer dereference the packet
+    BufferEntryBase *queue_head = backendQueue.top();
+    PacketPtr pkt = queue_head->pkt;
+    const Addr target_addr = pkt->getAddr();
+    const std::string cmd = pkt->cmdString();
+    if (!port.sendTimingResp(pkt)) {
+        DPRINTF(DRAM, "Response port blocked\n");
+        responsePortBlocked = true;
+        return;
+    }
+    DPRINTF(DRAM, "Sent response packet %s for 0x%0x\n", cmd, target_addr);
+    backendQueue.pop();
+    delete queue_head;
+    // Remaining responses go out no earlier than the next controller cycle
+    if (!backendQueue.empty()) {
+        Tick next_ready = std::max(backendQueue.top()->readyTime, nextCycle());
+        assert(!backendEvent.scheduled() && !responsePortBlocked);
+        schedule(backendEvent, next_ready);
+    }
+    // Restart the response stage if it holds entries but has no event
+    if (!respQueue.empty() && !respondEvent.scheduled()) {
+        Tick next_ready = std::max(respQueue.front()->readyTime, nextCycle());
+        schedule(respondEvent, next_ready);
+    }
+    // Popping freed a buffer entry, so a refused requester may retry
+    if (requestPortBlocked) {
+        DPRINTF(DRAM, "    Unblocking requests\n");
+        requestPortBlocked = false;
+        port.sendRetryReq();
+    }
+}
+
+void
 DRAMCtrl::activateBank(Rank& rank_ref, Bank& bank_ref,
                        Tick act_tick, uint32_t row)
 {
@@ -1975,6 +2054,10 @@
 
     avgMemAccLat = totMemAccLat / (readBursts - servicedByWrQ);
 
+    numBufsFullRetry
+        .name(name() + ".numBufsFullRetry")
+        .desc("Number of times all buffers full causing retry");
+
     numRdRetry
         .name(name() + ".numRdRetry")
         .desc("Number of times read queue was full causing retry");
@@ -2197,8 +2280,7 @@
 }
 
 DRAMCtrl::MemoryPort::MemoryPort(const std::string& name, DRAMCtrl& _memory)
-    : QueuedSlavePort(name, &_memory, queue), queue(_memory, *this),
-      memory(_memory)
+    : SlavePort(name, &_memory), memory(_memory)
 { }
 
 AddrRangeList
@@ -2214,12 +2296,10 @@
 {
     pkt->pushLabel(memory.name());
 
-    if (!queue.checkFunctional(pkt)) {
-        // Default implementation of SimpleTimingPort::recvFunctional()
-        // calls recvAtomic() and throws away the latency; we can save a
-        // little here by just not calculating the latency.
-        memory.recvFunctional(pkt);
-    }
+    // Default implementation of SimpleTimingPort::recvFunctional()
+    // calls recvAtomic() and throws away the latency; we can save a
+    // little here by just not calculating the latency.
+    memory.recvFunctional(pkt);
 
     pkt->popLabel();
 }
@@ -2237,6 +2317,12 @@
     return memory.recvTimingReq(pkt);
 }
 
+void
+DRAMCtrl::MemoryPort::recvRespRetry()
+{
+    memory.recvRespRetry();  // delegate: the controller owns the retry state
+}
+
 DRAMCtrl*
 DRAMCtrlParams::create()
 {
diff -r 10647f5d0f7f -r 8492ca5ea301 src/mem/DRAMCtrl.py
--- a/src/mem/DRAMCtrl.py	Thu Feb 04 16:57:59 2016 -0600
+++ b/src/mem/DRAMCtrl.py	Mon Feb 08 13:55:00 2016 -0600
@@ -82,6 +82,8 @@
     # the cacheline size or request/packet size
     write_buffer_size = Param.Unsigned(64, "Number of write queue entries")
     read_buffer_size = Param.Unsigned(32, "Number of read queue entries")
+    response_buffer_size = Param.Unsigned(32, "Number of response queue "
+                                          "entries")
 
     # threshold in percent for when to forcefully trigger writes and
     # start emptying the write buffer
diff -r 10647f5d0f7f -r 8492ca5ea301 src/mem/dram_ctrl.hh
--- a/src/mem/dram_ctrl.hh	Thu Feb 04 16:57:59 2016 -0600
+++ b/src/mem/dram_ctrl.hh	Mon Feb 08 13:55:00 2016 -0600
@@ -52,6 +52,7 @@
 #define __MEM_DRAM_CTRL_HH__
 
 #include <deque>
+#include <queue>
 #include <string>
 #include <unordered_set>
 
@@ -93,10 +94,9 @@
 
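-    // For now, make use of a queued slave port to avoid dealing with
-    // flow control for the responses being sent back
+    // Plain slave port: the controller handles response flow control
+    // itself, resending from its backend queue on recvRespRetry()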
-    class MemoryPort : public QueuedSlavePort
+    class MemoryPort : public SlavePort
     {
 
-        RespPacketQueue queue;
         DRAMCtrl& memory;
 
       public:
@@ -111,6 +111,8 @@
 
         bool recvTimingReq(PacketPtr);
 
+        void recvRespRetry();
+
         virtual AddrRangeList getAddrRanges() const;
 
     };
@@ -127,7 +129,17 @@
     bool isTimingMode;
 
     /**
-     * Remember if we have to retry a request when available.
+     * Bools to track whether the port is blocked from receiving requests
+     * because total buffering is full (and thus, needs to send a retry to the
+     * requester when buffer space is available again), and to track when the
+     * response port gets blocked.
+     */
+    bool requestPortBlocked;
+    bool responsePortBlocked;
+
+    /**
+     * Remember the type of request we have to retry when read or write buffer
+     * space is available, respectively.
      */
     bool retryRdReq;
     bool retryWrReq;
@@ -408,10 +420,12 @@
     };
 
     /**
-     * A DRAM packet stores packets along with the timestamp of when
-     * the packet entered the queue, and also the decoded address.
+     * The base class for buffer entries in the controller. These track
+     * the request packet and timestamp of when a packet entered a queue.
+     * BufferEntryBase is used to wrap packets queued in the backend queue,
+     * since they only need the packet pointer and timing info.
      */
-    class DRAMPacket {
+    class BufferEntryBase {
 
       public:
 
@@ -424,6 +438,25 @@
         /** This comes from the outside world */
         const PacketPtr pkt;
 
+        BufferEntryBase(PacketPtr _pkt, Tick ready_time)
+            : entryTime(curTick()), readyTime(ready_time), pkt(_pkt)
+        { }
+
+        bool operator<(const BufferEntryBase& pkt_wrap) const
+        { return readyTime < pkt_wrap.readyTime; } // sooner-ready sorts lower
+    };
+
+    /**
+     * A DRAM packet stores packets along with the timestamp of when
+     * the packet entered the queue, so it inherits these from
+     * BufferEntryBase, and adds decoded DRAM access information like
+     * the type and size of the access, and rank/bank/row information. These
+     * are queued in the read, write, and resp queues.
+     */
+    class DRAMPacket : public BufferEntryBase {
+
+      public:
+
         const bool isRead;
 
         /** Will be populated by address decoder */
@@ -463,10 +496,10 @@
         DRAMPacket(PacketPtr _pkt, bool is_read, uint8_t _rank, uint8_t _bank,
                    uint32_t _row, uint16_t bank_id, Addr _addr,
                    unsigned int _size, Bank& bank_ref, Rank& rank_ref)
-            : entryTime(curTick()), readyTime(curTick()),
-              pkt(_pkt), isRead(is_read), rank(_rank), bank(_bank), row(_row),
-              bankId(bank_id), addr(_addr), size(_size), burstHelper(NULL),
-              bankRef(bank_ref), rankRef(rank_ref)
+            : BufferEntryBase(_pkt, curTick()), isRead(is_read), rank(_rank),
+              bank(_bank), row(_row), bankId(bank_id), addr(_addr),
+              size(_size), burstHelper(NULL), bankRef(bank_ref),
+              rankRef(rank_ref)
         { }
 
     };
@@ -484,6 +517,14 @@
     EventWrapper<DRAMCtrl, &DRAMCtrl::processRespondEvent> respondEvent;
 
     /**
+     * Check if all buffers are full
+     *
+     * @param pktCount The number of buffer entries needed
+     * @return true if buffers would overfill, false otherwise
+     */
+    bool buffersFull(unsigned int pktCount) const;
+
+    /**
      * Check if the read queue has room for more entries
      *
      * @param pktCount The number of entries needed in the read queue
@@ -552,6 +593,15 @@
     void accessAndRespond(PacketPtr pkt, Tick static_latency);
 
     /**
+     * When the backend queue has response packets to be sent back to
+     * requesters, the backendEvent is scheduled and calls
+     * serviceBackendQueue(). If the response port unblocks backend queue
+     * space, then this function wakes up the appropriate waiting queues.
+     */
+    void serviceBackendQueue();
+    EventWrapper<DRAMCtrl, &DRAMCtrl::serviceBackendQueue> backendEvent;
+
+    /**
      * Address decoder to figure out physical mapping onto ranks,
      * banks, and rows. This function is called multiple times on the same
      * system packet if the pakcet is larger than burst of the memory. The
@@ -671,6 +721,27 @@
     std::deque<DRAMPacket*> respQueue;
 
     /**
+     * A comparator to sort BufferEntryBase types based on their ready times.
+     * This is currently used to sort the backend queue. The pointers are
+     * taken by value and to const so const or temporary arguments bind too.
+     */
+    struct rqCompare {
+        bool operator() (const BufferEntryBase *lhs,
+                         const BufferEntryBase *rhs) const
+        { return lhs->readyTime > rhs->readyTime; }
+    };
+
+    /**
+     * The backend queue is a priority queue that sorts response packets on
+     * when they become available to send through the port (e.g. read
+     * packets traverse the read queue, access memory, and incur backend
+     * latency, while writes only incur frontend latency, so they might be
+     * sent before reads started at the same time).
+     */
+    std::priority_queue<BufferEntryBase*, std::vector<BufferEntryBase*>,
+                        rqCompare > backendQueue;
+
+    /**
      * Vector of ranks
      */
     std::vector<Rank*> ranks;
@@ -698,6 +769,7 @@
     uint32_t rowsPerBank;
     const uint32_t readBufferSize;
     const uint32_t writeBufferSize;
+    const uint32_t totalBuffers;
     const uint32_t writeHighThreshold;
     const uint32_t writeLowThreshold;
     const uint32_t minWritesPerSwitch;
@@ -785,6 +857,7 @@
     Stats::Scalar neitherReadNorWrite;
     Stats::Vector perBankRdBursts;
     Stats::Vector perBankWrBursts;
+    Stats::Scalar numBufsFullRetry;
     Stats::Scalar numRdRetry;
     Stats::Scalar numWrRetry;
     Stats::Scalar totGap;
@@ -886,6 +959,7 @@
     Tick recvAtomic(PacketPtr pkt);
     void recvFunctional(PacketPtr pkt);
     bool recvTimingReq(PacketPtr pkt);
+    void recvRespRetry();
 
 };