# Node ID 8a1419dbbfa65b61cdd341905f3f919a7d885440
# Parent  df24b9af42c72606f1fa8e5aa0502b53e81ea176
diff --git a/src/mem/cache/cache.cc b/src/mem/cache/cache.cc
--- a/src/mem/cache/cache.cc
+++ b/src/mem/cache/cache.cc
@@ -670,6 +670,18 @@
     // the delay provided by the crossbar
     Tick forward_time = clockEdge(forwardLatency) + pkt->headerDelay;
 
+    if (pkt->cmd == MemCmd::LockedRMWWriteReq) {
+        // For LockedRMW accesses, we mark the block inaccessible after the
+        // read (see below), to make sure no one gets in before the write.
+        // Now that the write is here, mark it accessible again, so the
+        // write will succeed.  LockedRMWReadReq fetches the block in
+        // exclusive mode, so we know it was previously writable.
+        CacheBlk *blk = tags->findBlock(pkt->getAddr(), pkt->isSecure());
+        assert(blk && blk->isValid());
+        assert(!blk->isReadable() && !blk->isWritable());
+        blk->status |= (BlkReadable | BlkWritable);
+    }
+
     // We use lookupLatency here because it is used to specify the latency
     // to access.
     Cycles lat = lookupLatency;
@@ -718,6 +730,56 @@
                 next_pf_time = prefetcher->notify(pkt);
         }
 
+        // handle special cases for LockedRMW transactions
+        if (pkt->isLockedRMW()) {
+            Addr blk_addr = blockAlign(pkt->getAddr());
+
+            if (pkt->isRead()) {
+                // Read hit for LockedRMW.  Since it requires exclusive
+                // permissions, there should be no outstanding access.
+                assert(!mshrQueue.findMatch(blk_addr, pkt->isSecure()));
+                // The keys to LockedRMW are that (1) we always have an MSHR
+                // allocated during the RMW interval to catch snoops and
+                // defer them until after the RMW completes, and (2) we
+                // clear permissions on the block to turn any upstream
+                // access other than the matching write into a miss, causing
+                // it to append to the MSHR as well.
+
+                // Because we hit in the cache, we have to fake an MSHR to
+                // achieve part (1).  If the read had missed, this MSHR
+                // would get allocated as part of normal miss processing.
+                // Basically we need to get the MSHR in the same state as if
+                // we had missed and just received the response.
+                Request *req2 = new Request(*(pkt->req));
+                Packet *pkt2 = new Packet(req2, pkt->cmd);
+                MSHR *mshr = allocateMissBuffer(pkt2, curTick(), true);
+                // Mark the MSHR "in service" (even though it's not) to prevent
+                // the cache from sending out a request.
+                mshrQueue.markInService(mshr, false);
+                // Part (2): mark block inaccessible
+                assert(blk);
+                blk->status &= ~(BlkReadable | BlkWritable);
+            } else {
+                assert(pkt->isWrite());
+                // All LockedRMW writes come here, as they cannot miss.
+                // Need to undo the two things described above.  Block
+                // permissions were already restored earlier in this
+                // function, prior to the access() call.  Now we just need
+                // to clear out the MSHR.
+
+                // Read should have already allocated MSHR.
+                MSHR *mshr = mshrQueue.findMatch(blk_addr, pkt->isSecure());
+                assert(mshr);
+                // Fake up a packet and "respond" to the still-pending
+                // LockedRMWRead, to process any pending targets and clear
+                // out the MSHR
+                Packet *resp_pkt =
+                    new Packet(pkt->req, MemCmd::LockedRMWWriteResp);
+                resp_pkt->senderState = mshr;
+                recvTimingResp(resp_pkt);
+            }
+        }
+
         if (needsResponse) {
             pkt->makeTimingResponse();
             // @todo: Make someone pay for this
@@ -750,6 +812,9 @@
         MSHR *mshr = pkt->req->isUncacheable() ? nullptr :
             mshrQueue.findMatch(blk_addr, pkt->isSecure());
 
+        // See note above: these should always hit
+        assert(pkt->cmd != MemCmd::LockedRMWWriteReq);
+
         // Software prefetch handling:
         // To keep the core from waiting on data it won't look at
         // anyway, send back a response with dummy data. Miss handling
@@ -1290,7 +1355,7 @@
 
     // if this is a write, we should be looking at an uncacheable
     // write
-    if (pkt->isWrite()) {
+    if (pkt->isWrite() && pkt->cmd != MemCmd::LockedRMWWriteResp) {
         assert(pkt->req->isUncacheable());
         handleUncacheableWriteResp(pkt);
         return;
@@ -1354,7 +1419,26 @@
     // First offset for critical word first calculations
     int initial_offset = initial_tgt->pkt->getOffset(blkSize);
 
-    while (mshr->hasTargets()) {
+    if (pkt->cmd == MemCmd::LockedRMWWriteResp) {
+        // This is the fake response generated by the write half of the RMW;
+        // see comments in recvTimingReq().  The first target on the list
+        // should be the LockedRMWReadReq which has already been satisfied,
+        // either because it was a hit (and the MSHR was allocated in
+        // recvTimingReq()) or because it was left there after the initial
+        // response (using the 'early_exit' flag below).  In either case, we
+        // don't need to respond now, so pop it off to prevent the loop
+        // below from generating another response.
+        assert(initial_tgt->pkt->cmd == MemCmd::LockedRMWReadReq);
+        delete initial_tgt->pkt->req;
+        delete initial_tgt->pkt;
+        mshr->popTarget();
+        initial_tgt = nullptr;
+    }
+
+    // Early exit flag for LockedRMWRead
+    bool early_exit = false;
+
+    while (mshr->hasTargets() && !early_exit) {
         MSHR::Target *target = mshr->getTarget();
         Packet *tgt_pkt = target->pkt;
 
@@ -1419,6 +1503,22 @@
                 assert(tgt_pkt->req->masterId() < system->maxMasters());
                 missLatency[tgt_pkt->cmdToIndex()][tgt_pkt->req->masterId()] +=
                     completion_time - target->recvTime;
+
+                if (tgt_pkt->cmd == MemCmd::LockedRMWReadReq) {
+                    // We're going to leave a target in the MSHR until the
+                    // write half of the RMW occurs (see comments above in
+                    // recvTimingReq()).  Since we'll be using the current
+                    // request packet (which has the allocated data pointer)
+                    // to form the response, we have to allocate a new dummy
+                    // packet to save in the MSHR target.
+                    Request *req = new Request(*(tgt_pkt->req));
+                    target->pkt = new Packet(req, tgt_pkt->cmd);
+                    // skip the rest of target processing after we
+                    // send the response
+                    early_exit = true;
+                    // Mark block inaccessible until write arrives
+                    blk->status &= ~(BlkReadable | BlkWritable);
+                }
             } else if (pkt->cmd == MemCmd::UpgradeFailResp) {
                 // failed StoreCond upgrade
                 assert(tgt_pkt->cmd == MemCmd::StoreCondReq ||
@@ -1430,6 +1530,11 @@
                 completion_time += clockEdge(responseLatency) +
                     pkt->payloadDelay;
                 tgt_pkt->req->setExtraData(0);
+            } else if (pkt->cmd == MemCmd::LockedRMWWriteResp) {
+                // Fake response on LockedRMW completion, see above.
+                // Since the data is already in the cache, we just use
+                // responseLatency with no extra penalties.
+                completion_time = clockEdge(responseLatency);
             } else {
                 // not a cache fill, just forwarding response
                 // responseLatency is the latency of the return path
@@ -1483,73 +1588,78 @@
             panic("Illegal target->source enum %d\n", target->source);
         }
 
-        mshr->popTarget();
+        if (!early_exit)
+            mshr->popTarget();
     }
 
-    if (blk && blk->isValid()) {
-        // an invalidate response stemming from a write line request
-        // should not invalidate the block, so check if the
-        // invalidation should be discarded
-        if (is_invalidate || mshr->hasPostInvalidate()) {
-            invalidateBlock(blk);
-        } else if (mshr->hasPostDowngrade()) {
-            blk->status &= ~BlkWritable;
+    if (!early_exit) {
+        if (blk && blk->isValid()) {
+            // an invalidate response stemming from a write line request
+            // should not invalidate the block, so check if the
+            // invalidation should be discarded
+            if (is_invalidate || mshr->hasPostInvalidate()) {
+                invalidateBlock(blk);
+            } else if (mshr->hasPostDowngrade()) {
+                blk->status &= ~BlkWritable;
+            }
+        }
+
+        if (mshr->promoteDeferredTargets()) {
+            // avoid later read getting stale data while write miss is
+            // outstanding.. see comment in timingAccess()
+            if (blk) {
+                blk->status &= ~BlkReadable;
+            }
+            mshrQueue.markPending(mshr);
+            schedMemSideSendEvent(clockEdge() + pkt->payloadDelay);
+        } else {
+            mshrQueue.deallocate(mshr);
+            if (wasFull && !mshrQueue.isFull()) {
+                clearBlocked(Blocked_NoMSHRs);
+            }
+
+            // Request the bus for a prefetch if this deallocation freed enough
+            // MSHRs for a prefetch to take place
+            if (prefetcher && mshrQueue.canPrefetch()) {
+                Tick next_pf_time =
+                    std::max(prefetcher->nextPrefetchReadyTime(), clockEdge());
+                if (next_pf_time != MaxTick)
+                    schedMemSideSendEvent(next_pf_time);
+            }
+        }
+
+        // if we used temp block, check to see if its valid and then
+        // clear it out
+        if (blk == tempBlock && tempBlock->isValid()) {
+            // We use forwardLatency here because we are copying
+            // Writebacks/CleanEvicts to write buffer. It specifies
+            // the latency to allocate an internal buffer and to
+            // schedule an event to the queued port.
+            if (blk->isDirty() || writebackClean) {
+                PacketPtr wbPkt = writebackBlk(blk);
+                allocateWriteBuffer(wbPkt, forward_time);
+                // Set BLOCK_CACHED flag if cached above.
+                if (isCachedAbove(wbPkt))
+                    wbPkt->setBlockCached();
+            } else {
+                PacketPtr wcPkt = cleanEvictBlk(blk);
+                // Check to see if block is cached above. If not allocate
+                // write buffer
+                if (isCachedAbove(wcPkt))
+                    delete wcPkt;
+                else
+                    allocateWriteBuffer(wcPkt, forward_time);
+            }
+            blk->invalidate();
         }
     }
 
-    if (mshr->promoteDeferredTargets()) {
-        // avoid later read getting stale data while write miss is
-        // outstanding.. see comment in timingAccess()
-        if (blk) {
-            blk->status &= ~BlkReadable;
-        }
-        mshrQueue.markPending(mshr);
-        schedMemSideSendEvent(clockEdge() + pkt->payloadDelay);
-    } else {
-        mshrQueue.deallocate(mshr);
-        if (wasFull && !mshrQueue.isFull()) {
-            clearBlocked(Blocked_NoMSHRs);
-        }
-
-        // Request the bus for a prefetch if this deallocation freed enough
-        // MSHRs for a prefetch to take place
-        if (prefetcher && mshrQueue.canPrefetch()) {
-            Tick next_pf_time = std::max(prefetcher->nextPrefetchReadyTime(),
-                                         clockEdge());
-            if (next_pf_time != MaxTick)
-                schedMemSideSendEvent(next_pf_time);
-        }
-    }
     // reset the xbar additional timinig  as it is now accounted for
     pkt->headerDelay = pkt->payloadDelay = 0;
 
     // copy writebacks to write buffer
     doWritebacks(writebacks, forward_time);
 
-    // if we used temp block, check to see if its valid and then clear it out
-    if (blk == tempBlock && tempBlock->isValid()) {
-        // We use forwardLatency here because we are copying
-        // Writebacks/CleanEvicts to write buffer. It specifies the latency to
-        // allocate an internal buffer and to schedule an event to the
-        // queued port.
-        if (blk->isDirty() || writebackClean) {
-            PacketPtr wbPkt = writebackBlk(blk);
-            allocateWriteBuffer(wbPkt, forward_time);
-            // Set BLOCK_CACHED flag if cached above.
-            if (isCachedAbove(wbPkt))
-                wbPkt->setBlockCached();
-        } else {
-            PacketPtr wcPkt = cleanEvictBlk(blk);
-            // Check to see if block is cached above. If not allocate
-            // write buffer
-            if (isCachedAbove(wcPkt))
-                delete wcPkt;
-            else
-                allocateWriteBuffer(wcPkt, forward_time);
-        }
-        blk->invalidate();
-    }
-
     DPRINTF(CacheVerbose, "Leaving %s with %s for addr %#llx\n", __func__,
             pkt->cmdString(), pkt->getAddr());
     delete pkt;
diff --git a/src/mem/cache/mshr.hh b/src/mem/cache/mshr.hh
--- a/src/mem/cache/mshr.hh
+++ b/src/mem/cache/mshr.hh
@@ -124,7 +124,7 @@
         const Tick recvTime;  //!< Time when request was received (for stats)
         const Tick readyTime; //!< Time when request is ready to be serviced
         const Counter order;  //!< Global order (for memory consistency mgmt)
-        const PacketPtr pkt;  //!< Pending request packet.
+        PacketPtr pkt;  //!< Pending request packet.
         const Source source;  //!< Request from cpu, memory, or prefetcher?
         const bool markedPending; //!< Did we mark upstream MSHR
                                   //!< as downstreamPending?
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -107,6 +107,10 @@
         StoreCondReq,
         StoreCondFailReq,       // Failed StoreCondReq in MSHR (never sent)
         StoreCondResp,
+        LockedRMWReadReq,
+        LockedRMWReadResp,
+        LockedRMWWriteReq,
+        LockedRMWWriteResp,
         SwapReq,
         SwapResp,
         MessageReq,
@@ -147,6 +151,7 @@
         IsSWPrefetch,
         IsHWPrefetch,
         IsLlsc,         //!< Alpha/MIPS LL or SC access
+        IsLockedRMW,    //!< x86 locked RMW access
         HasData,        //!< There is an associated payload
         IsError,        //!< Error response
         IsPrint,        //!< Print state matching address (for debugging)
@@ -207,6 +212,7 @@
      */
     bool hasData() const        { return testCmdAttrib(HasData); }
     bool isLLSC() const         { return testCmdAttrib(IsLlsc); }
+    bool isLockedRMW() const    { return testCmdAttrib(IsLockedRMW); }
     bool isSWPrefetch() const   { return testCmdAttrib(IsSWPrefetch); }
     bool isHWPrefetch() const   { return testCmdAttrib(IsHWPrefetch); }
     bool isPrefetch() const     { return testCmdAttrib(IsSWPrefetch) ||
@@ -522,6 +528,7 @@
         return resp_cmd.hasData();
     }
     bool isLLSC() const              { return cmd.isLLSC(); }
+    bool isLockedRMW() const         { return cmd.isLockedRMW(); }
     bool isError() const             { return cmd.isError(); }
     bool isPrint() const             { return cmd.isPrint(); }
     bool isFlush() const             { return cmd.isFlush(); }
@@ -779,6 +786,8 @@
             return MemCmd::LoadLockedReq;
         else if (req->isPrefetch())
             return MemCmd::SoftPFReq;
+        else if (req->isLockedRMW())
+            return MemCmd::LockedRMWReadReq;
         else
             return MemCmd::ReadReq;
     }
@@ -793,6 +802,8 @@
             return MemCmd::StoreCondReq;
         else if (req->isSwap())
             return MemCmd::SwapReq;
+        else if (req->isLockedRMW())
+            return MemCmd::LockedRMWWriteReq;
         else
             return MemCmd::WriteReq;
     }
diff --git a/src/mem/packet.cc b/src/mem/packet.cc
--- a/src/mem/packet.cc
+++ b/src/mem/packet.cc
@@ -159,6 +159,18 @@
     /* StoreCondResp */
     { SET3(IsWrite, IsLlsc, IsResponse),
             InvalidCmd, "StoreCondResp" },
+    /* LockedRMWReadReq */
+    { SET5(IsRead, IsLockedRMW, NeedsWritable, IsRequest, NeedsResponse),
+            LockedRMWReadResp, "LockedRMWReadReq" },
+    /* LockedRMWReadResp */
+    { SET5(IsRead, IsLockedRMW, NeedsWritable, IsResponse, HasData),
+            InvalidCmd, "LockedRMWReadResp" },
+    /* LockedRMWWriteReq */
+    { SET6(IsWrite, IsLockedRMW, NeedsWritable, IsRequest, NeedsResponse,
+           HasData), LockedRMWWriteResp, "LockedRMWWriteReq" },
+    /* LockedRMWWriteResp */
+    { SET4(IsWrite, IsLockedRMW, NeedsWritable, IsResponse),
+            InvalidCmd, "LockedRMWWriteResp" },
     /* SwapReq -- for Swap ldstub type operations */
     { SET6(IsRead, IsWrite, NeedsWritable, IsRequest, HasData, NeedsResponse),
         SwapResp, "SwapReq" },