# Node ID e5e740fac1fb0a66ce81690c93f7768340c16c24
# Parent  915989ed4b8a4f17e5cde158c8123b5b9860e285
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -247,6 +247,9 @@
     /** Attempts to send a store to the cache. */
     bool sendStore(PacketPtr data_pkt);
 
+    /** Attempts to send a store exclusive prefetch to the cache. */
+    bool sendStoreAccess(PacketPtr data_pkt);
+
     /** Increments the given store index (circular queue). */
     inline void incrStIdx(int &store_idx) const;
     /** Decrements the given store index (circular queue). */
@@ -508,6 +511,12 @@
     /** Number of times the LSQ is blocked due to the cache. */
     Stats::Scalar lsqCacheBlocked;
 
+    /** Number of times StoreAccess gets sent successfully */
+    Stats::Scalar lsqStoreAccessNonBlocked;
+
+    /** Number of times StoreAccess couldn't get sent successfully */
+    Stats::Scalar lsqStoreAccessBlocked;
+
   public:
     /** Executes the load at the given index. */
     Fault read(Request *req, Request *sreqLow, Request *sreqHigh,
@@ -892,15 +901,81 @@
     assert(size <= sizeof(storeQueue[store_idx].data) ||
             (req->getFlags() & Request::CACHE_BLOCK_ZERO));
 
+    bool split = false;
     // Split stores can only occur in ISAs with unaligned memory accesses.  If
     // a store request has been split, sreqLow and sreqHigh will be non-null.
     if (TheISA::HasUnalignedMemAcc && sreqLow) {
         storeQueue[store_idx].isSplit = true;
+        split = true;
     }
 
     if (!(req->getFlags() & Request::CACHE_BLOCK_ZERO))
         memcpy(storeQueue[store_idx].data, data, size);
 
+    // Issue a store access request
+    if (usedPorts < cachePorts  && !hasPendingPkt &&
+        size != 0 && !storeQueue[store_idx].inst->isDataPrefetch() &&
+        !req->isMmappedIpr() && !req->isUncacheable() &&
+        cpu->system->isMemAddr(req->getPaddr())) {
+
+        ++usedPorts;
+        DynInstPtr inst = storeQueue[store_idx].inst;
+        MemCmd command = MemCmd::StoreAccess;
+
+        PacketPtr data_pkt;
+        PacketPtr snd_data_pkt = NULL;
+
+        LSQSenderState *state = new LSQSenderState;
+        state->isLoad = false;
+        state->idx = store_idx;
+
+        if (!split) {
+            // Build a single data packet if the store isn't split.
+            data_pkt = new Packet(req, command);
+            data_pkt->senderState = state;
+        } else {
+            // Create two packets if the store is split in two.
+            data_pkt = new Packet(sreqLow, command);
+            snd_data_pkt = new Packet(sreqHigh, command);
+
+            data_pkt->senderState = state;
+            snd_data_pkt->senderState = state;
+
+            state->isSplit = true;
+            state->outstanding = 2;
+
+            req = sreqLow;
+        }
+
+        DPRINTF(LSQUnit, "D-Cache: store exclusive prefetch idx:%i PC:%s "
+                "to Addr:%#x, [sn:%lli]\n", store_idx, inst->pcState(),
+                req->getPaddr(), inst->seqNum);
+
+        if (!sendStoreAccess(data_pkt)) {
+            DPRINTF(LSQUnit, "D-Cache became blocked when writing store "
+                    "permission prefetch [sn:%lli], will NOT retry later\n",
+                    inst->seqNum);
+            ++lsqStoreAccessBlocked;
+        } else {
+            // Sent the first packet successfully.
+            // If the store is split, try to send the second packet too
+            ++lsqStoreAccessNonBlocked;
+            if (split) {
+                assert(snd_data_pkt);
+                //
+                // Ensure there are enough ports to use.
+                //
+                if (usedPorts < cachePorts) {
+                    ++usedPorts;
+                    if (!sendStoreAccess(snd_data_pkt))
+                        ++lsqStoreAccessBlocked;
+                    else
+                        ++lsqStoreAccessNonBlocked;
+                }
+            }
+        }
+    }
+
     // This function only writes the data to the store queue, so no fault
     // can happen here.
     return NoFault;
diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -94,6 +94,8 @@
 void
 LSQUnit<Impl>::completeDataAccess(PacketPtr pkt)
 {
+    DPRINTF(Activity, "CompleteDataAccess for address: 0x%x; storeAccess: %s\n",
+                       pkt->getAddr(), pkt->isStoreAccess()? "yes" : "no");
     LSQSenderState *state = dynamic_cast<LSQSenderState *>(pkt->senderState);
     DynInstPtr inst = state->inst;
     DPRINTF(IEW, "Writeback event [sn:%lli].\n", inst->seqNum);
@@ -255,7 +257,18 @@
 
     lsqCacheBlocked
         .name(name() + ".cacheBlocked")
-        .desc("Number of times an access to memory failed due to the cache being blocked");
+        .desc("Number of times an access to memory failed "
+              "due to the cache being blocked");
+
+    lsqStoreAccessNonBlocked
+        .name(name() + ".storeAccessNonBlocked")
+        .desc("Number of times a StoreAccess to memory "
+              "sent successfully");
+
+    lsqStoreAccessBlocked
+        .name(name() + ".storeAccessBlocked")
+        .desc("Number of times a StoreAccess to memory failed "
+              "due to cache being blocked");
 }
 
 template<class Impl>
@@ -1229,6 +1242,17 @@
     return true;
 }
 
+// Exclusive permission prefetch: hand the packet to the D-cache port.
+template <class Impl>
+bool
+LSQUnit<Impl>::sendStoreAccess(PacketPtr data_pkt)
+{
+    // Report whether the port accepted the packet. Nothing is retried or
+    // released in this function when the send is rejected.
+    const bool accepted = dcachePort->sendTimingReq(data_pkt);
+    return accepted;
+}
+
 template <class Impl>
 void
 LSQUnit<Impl>::recvRetry()
diff --git a/src/mem/abstract_mem.cc b/src/mem/abstract_mem.cc
--- a/src/mem/abstract_mem.cc
+++ b/src/mem/abstract_mem.cc
@@ -386,7 +386,7 @@
         bytesRead[pkt->req->masterId()] += pkt->getSize();
         if (pkt->req->isInstFetch())
             bytesInstRead[pkt->req->masterId()] += pkt->getSize();
-    } else if (pkt->isInvalidate()) {
+    } else if (pkt->isInvalidate() || pkt->isStoreAccess()) {
         // no need to do anything
         // this clause is intentionally before the write clause: the only
         // transaction that is both a write and an invalidate is
diff --git a/src/mem/cache/cache.cc b/src/mem/cache/cache.cc
--- a/src/mem/cache/cache.cc
+++ b/src/mem/cache/cache.cc
@@ -305,6 +305,10 @@
         return false;
     }
 
+    if (pkt->isStoreAccess()) {
+        DPRINTF(Cache, "StoreAccess request: %x\n", pkt->getAddr());
+    }
+
     ContextID id = pkt->req->hasContextId() ?
         pkt->req->contextId() : InvalidContextID;
     // Here lat is the value passed as parameter to accessBlock() function
@@ -390,6 +394,10 @@
         // like a Writeback which could not find a replaceable block so has to
         // go to next level.
         return false;
+    } else if (pkt->isStoreAccess() && blk && blk->isWritable()) {
+        // found a block and it's already in exclusive state
+        incHitCount(pkt);
+        return true;
     } else if ((blk != NULL) &&
                (pkt->needsExclusive() ? blk->isWritable()
                                       : blk->isReadable())) {
@@ -410,6 +418,10 @@
         return true;
     }
 
+    if (pkt->isStoreAccess()) {
+        DPRINTF(Cache, "StoreAccess miss in cache\n");
+    }
+
     return false;
 }
 
@@ -656,6 +668,10 @@
         assert(!pkt->req->isUncacheable());
 
         // hit (for all other request types)
+        if (pkt->isStoreAccess()) {
+            // StoreAccess hit in cache with exclusive permission, do nothing
+            return true;
+        }
 
         if (prefetcher && (prefetchOnAccess || (blk && blk->wasPrefetched()))) {
             if (blk)
@@ -668,6 +684,8 @@
 
         if (needsResponse) {
             pkt->makeTimingResponse();
+            DPRINTF(Cache, "Schedule timing response from cache, "
+                           "latency: %d, clock: %lld\n", lat, clockEdge(lat));
             // @todo: Make someone pay for this
             pkt->headerDelay = pkt->payloadDelay = 0;
 
@@ -687,6 +705,20 @@
         }
     } else {
         // miss
+        // StoreAccess prefetch
+        if (pkt->isStoreAccess()) {
+            DPRINTF(Cache, "StoreAccess missed in cache or with no "
+                           "exclusive permission\n");
+            if (prefetcher) {
+                DPRINTF(Cache, "Prefetcher enabled, calculating "
+                               "next_pf_time\n");
+                next_pf_time = prefetcher->notify(pkt);
+                DPRINTF(Cache, "Sending prefetch request for StoreAccess\n");
+                schedMemSideSendEvent(next_pf_time);
+
+            }
+            return true;
+        }
 
         Addr blk_addr = blockAlign(pkt->getAddr());
 
@@ -879,8 +911,6 @@
         return NULL;
     }
 
-    assert(cpu_pkt->needsResponse());
-
     MemCmd cmd;
     // @TODO make useUpgrades a parameter.
     // Note that ownership protocols require upgrade, otherwise a
@@ -904,7 +934,7 @@
         // forward as invalidate to all other caches, this gives us
         // the line in exclusive state, and invalidates all other
         // copies
-        cmd = MemCmd::InvalidateReq;
+        cmd = MemCmd::InvalidationReq;
     } else {
         // block is invalid
         cmd = needsExclusive ? MemCmd::ReadExReq :
@@ -1027,7 +1057,7 @@
                 if (bus_pkt->isError()) {
                     pkt->makeAtomicResponse();
                     pkt->copyError(bus_pkt);
-                } else if (pkt->cmd == MemCmd::InvalidateReq) {
+                } else if (pkt->cmd == MemCmd::InvalidationReq) {
                     if (blk) {
                         // invalidate response to a cache that received
                         // an invalidate request
@@ -1857,7 +1887,7 @@
     // an invalidate, we don't need to send a response. The
     // invalidation itself is taken care of below.
     bool respond = blk->isDirty() && pkt->needsResponse() &&
-        pkt->cmd != MemCmd::InvalidateReq;
+        pkt->cmd != MemCmd::InvalidationReq;
     bool have_exclusive = blk->isWritable();
 
     // Invalidate any prefetch's from below that would strip write permissions
diff --git a/src/mem/cache/mshr.cc b/src/mem/cache/mshr.cc
--- a/src/mem/cache/mshr.cc
+++ b/src/mem/cache/mshr.cc
@@ -218,7 +218,7 @@
     assert(targets.isReset());
     // Don't know of a case where we would allocate a new MSHR for a
     // snoop (mem-side request), so set source according to request here
-    Target::Source source = (target->cmd == MemCmd::HardPFReq) ?
+    Target::Source source = (target->cmd.isHWPrefetch()) ?
         Target::FromPrefetcher : Target::FromCPU;
     targets.add(target, when_ready, _order, source, true);
     assert(deferredTargets.isReset());
diff --git a/src/mem/cache/prefetch/queued.cc b/src/mem/cache/prefetch/queued.cc
--- a/src/mem/cache/prefetch/queued.cc
+++ b/src/mem/cache/prefetch/queued.cc
@@ -61,6 +61,37 @@
 Tick
 QueuedPrefetcher::notify(const PacketPtr &pkt)
 {
+    Tick pf_time = curTick() + clockPeriod() * latency;
+
+    if (pkt->isStoreAccess()) {
+        // convert storeAccess to HardPFExReq
+        Addr blk_addr = pkt->getAddr() & ~(Addr)(blkSize - 1);
+        Request *prefetchReq = new Request(blk_addr, blkSize, 0, masterId);
+        PacketPtr prefetch = new Packet(prefetchReq, MemCmd::HardPFExReq);
+
+        DPRINTF(HWPrefetch, "Add StoreAccess request "
+                            "with blk_addr: 0x%x to pf queue\n", blk_addr);
+
+        prefetch->allocate();
+        prefetch->req->setThreadContext(pkt->req->contextId(),
+                                        pkt->req->threadId());
+
+        // We just remove the head if we are full
+        if (pfq.size() == queueSize) {
+            pfRemovedFull++;
+            PacketPtr old_pkt = pfq.begin()->pkt;
+            DPRINTF(HWPrefetch, "Prefetch queue full, removing oldest 0x%x\n",
+                                old_pkt->getAddr());
+            delete old_pkt->req;
+            delete old_pkt;
+            pfq.pop_front();
+        }
+
+        pfq.emplace_back(DeferredPacket(pf_time, prefetch));
+
+        return pfq.empty() ? MaxTick : pfq.front().tick;
+    }
+
     // Verify this access type is observed by prefetcher
     if (observeAccess(pkt)) {
         Addr blk_addr = pkt->getAddr() & ~(Addr)(blkSize - 1);
@@ -142,7 +173,6 @@
                 pfq.pop_front();
             }
 
-            Tick pf_time = curTick() + clockPeriod() * latency;
             DPRINTF(HWPrefetch, "Prefetch queued. "
                     "addr:%#x tick:%lld.\n", pf_addr, pf_time);
 
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -90,6 +90,7 @@
         CleanEvict,
         SoftPFReq,
         HardPFReq,
+        HardPFExReq,
         SoftPFResp,
         HardPFResp,
         WriteLineReq,
@@ -125,8 +126,9 @@
         // Fake simulator-only commands
         PrintReq,       // Print state matching address
         FlushReq,      //request for a cache flush
-        InvalidateReq,   // request for address to be invalidated
-        InvalidateResp,
+        InvalidationReq,   // request for address to be invalidated from lsq
+        InvalidationResp,
+        StoreAccess,    //exclusive permission
         NUM_MEM_CMDS
     };
 
@@ -153,6 +155,7 @@
         IsFlush,        //!< Flush the address from caches
         IsAcquire,      //!< Acquire operation
         IsRelease,      //!< Release operation
+        IsStoreAccess,  //!< exclusive permission
         NUM_COMMAND_ATTRIBUTES
     };
 
@@ -211,6 +214,7 @@
     bool isFlush() const        { return testCmdAttrib(IsFlush); }
     bool isAcquire() const      { return testCmdAttrib(IsAcquire); }
     bool isRelease() const      { return testCmdAttrib(IsRelease); }
+    bool isStoreAccess() const  { return testCmdAttrib(IsStoreAccess); }
 
     const Command
     responseCommand() const
@@ -502,6 +506,7 @@
     bool isFlush() const             { return cmd.isFlush(); }
     bool isAcquire() const           { return cmd.isAcquire(); }
     bool isRelease() const           { return cmd.isRelease(); }
+    bool isStoreAccess() const       { return cmd.isStoreAccess(); }
 
     // Snoop flags
     void assertMemInhibit()
@@ -726,7 +731,10 @@
         // needed (CleanEvict and Writeback), since the snoop packet
         // re-uses the same request.
         if (req && isRequest() && !needsResponse() &&
-            !isExpressSnoop()) {
+            !isExpressSnoop() && !cmd.isHWPrefetch()) {
+            // Hardware prefetches are excluded here: a HWExPrefetch is
+            // marked as not needing a response, yet its request is already
+            // deleted when the response is processed in handleResponse().
             delete req;
         }
         deleteData();
diff --git a/src/mem/packet.cc b/src/mem/packet.cc
--- a/src/mem/packet.cc
+++ b/src/mem/packet.cc
@@ -95,6 +95,9 @@
     /* HardPFReq */
     { SET4(IsRead, IsRequest, IsHWPrefetch, NeedsResponse),
             HardPFResp, "HardPFReq" },
+    /* HardPFExReq */
+    { SET4(IsRead, IsRequest, IsHWPrefetch, NeedsExclusive),
+            InvalidCmd, "HardPFExReq" },
     /* SoftPFResp */
     { SET4(IsRead, IsResponse, IsSWPrefetch, HasData),
             InvalidCmd, "SoftPFResp" },
@@ -187,10 +190,13 @@
     { SET3(IsRequest, IsFlush, NeedsExclusive), InvalidCmd, "FlushReq" },
     /* Invalidation Request */
     { SET4(IsInvalidate, IsRequest, NeedsExclusive, NeedsResponse),
-      InvalidateResp, "InvalidateReq" },
+      InvalidationResp, "InvalidationReq" },
     /* Invalidation Response */
     { SET3(IsInvalidate, IsResponse, NeedsExclusive),
-      InvalidCmd, "InvalidateResp" }
+      InvalidCmd, "InvalidationResp" },
+    /* StoreAccess Request */
+    { SET3(NeedsExclusive, IsRequest, IsStoreAccess), InvalidCmd,
+           "StoreAccessReq" },
 };
 
 bool
diff --git a/src/mem/protocol/RubySlicc_Exports.sm b/src/mem/protocol/RubySlicc_Exports.sm
--- a/src/mem/protocol/RubySlicc_Exports.sm
+++ b/src/mem/protocol/RubySlicc_Exports.sm
@@ -140,6 +140,7 @@
   Release,           desc="Release operation";
   Acquire,           desc="Acquire opertion";
   AcquireRelease,    desc="Acquire and Release opertion";
+  Store_Access,      desc="Store Permission Prefetch";
 }
 
 enumeration(SequencerRequestType, desc="...", default="SequencerRequestType_NULL") {
diff --git a/src/mem/ruby/system/RubyPort.hh b/src/mem/ruby/system/RubyPort.hh
--- a/src/mem/ruby/system/RubyPort.hh
+++ b/src/mem/ruby/system/RubyPort.hh
@@ -169,6 +169,7 @@
 
   protected:
     void trySendRetries();
+    void retry_blocked_port(PacketPtr pkt);
     void ruby_hit_callback(PacketPtr pkt);
     void testDrainComplete();
     void ruby_eviction_callback(Addr address);
diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@@ -354,6 +354,24 @@
 }
 
 void
+RubyPort::retry_blocked_port(PacketPtr pkt)
+{
+    DPRINTF(RubyPort, "Potential blocked port for request: %s 0x%x\n",
+            pkt->cmdString(), pkt->getAddr());
+
+    // Pop the sender state off the packet to retrieve the request port;
+    // the state object is deleted once the port pointer has been read.
+    RubyPort::SenderState *ruby_state =
+        safe_cast<RubyPort::SenderState *>(pkt->popSenderState());
+    MemSlavePort *src_port = ruby_state->port;
+    assert(src_port != nullptr);
+    delete ruby_state;
+
+    // Put the port on the retry list unless it is already on it.
+    if (!onRetryList(src_port))
+        addToRetryList(src_port);
+}
+void
 RubyPort::ruby_hit_callback(PacketPtr pkt)
 {
     DPRINTF(RubyPort, "Hit callback for %s 0x%x\n", pkt->cmdString(),
@@ -531,7 +549,7 @@
                     Request::funcMasterId);
     // Use a single packet to signal all snooping ports of the invalidation.
     // This assumes that snooping ports do NOT modify the packet/request
-    Packet pkt(&request, MemCmd::InvalidateReq);
+    Packet pkt(&request, MemCmd::InvalidationReq);
     for (CpuPortIter p = slave_ports.begin(); p != slave_ports.end(); ++p) {
         // check if the connected master port is snooping
         if ((*p)->isSnooping()) {
diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc
--- a/src/mem/ruby/system/Sequencer.cc
+++ b/src/mem/ruby/system/Sequencer.cc
@@ -338,7 +338,8 @@
     while (!i->second->empty()) {
         request = i->second->front();
         if (ruby_request) {
-            assert((request->m_type == RubyRequestType_ST) ||
+            assert((request->m_type == RubyRequestType_Store_Access) ||
+                   (request->m_type == RubyRequestType_ST) ||
                    (request->m_type == RubyRequestType_ATOMIC) ||
                    (request->m_type == RubyRequestType_RMW_Read) ||
                    (request->m_type == RubyRequestType_RMW_Write) ||
@@ -481,7 +482,7 @@
     if (RubySystem::getWarmupEnabled()) {
         data.setData(pkt->getConstPtr<uint8_t>(),
                      getOffset(request_address), pkt->getSize());
-    } else if (!pkt->isFlush()) {
+    } else if (!pkt->isFlush() && !pkt->isStoreAccess()) {
         if ((type == RubyRequestType_LD) ||
             (type == RubyRequestType_IFETCH) ||
             (type == RubyRequestType_RMW_Read) ||
@@ -522,7 +523,13 @@
         delete pkt;
         rs->m_cache_recorder->enqueueNextFlushRequest();
     } else {
-        ruby_hit_callback(pkt);
+        if (type != RubyRequestType_Store_Access) {
+            ruby_hit_callback(pkt);
+        } else {
+            // A storeAccess can block the CPU from issuing stores, so call
+            // retry_blocked_port so the CPU can reissue blocked stores.
+            retry_blocked_port(pkt);
+        }
     }
 }
 
@@ -541,8 +548,12 @@
 
     RubyRequestType primary_type = RubyRequestType_NULL;
     RubyRequestType secondary_type = RubyRequestType_NULL;
-
-    if (pkt->isLLSC()) {
+    if (pkt->isStoreAccess()) {
+        DPRINTF(RubySequencer, "Issuing Store Access\n");
+        primary_type = RubyRequestType_Store_Access;
+        secondary_type = RubyRequestType_ST;
+    }
+    else if (pkt->isLLSC()) {
         //
         // Alpha LL/SC instructions need to be handled carefully by the cache
         // coherence protocol to ensure they follow the proper semantics. In
@@ -633,12 +644,18 @@
         pc = pkt->req->getPC();
     }
 
+    // Certain requests will not have allocated data
+    uint8_t* data;
+    if (pkt->isFlush() || pkt->isStoreAccess()) {
+        data = nullptr;
+    } else {
+        data = pkt->getPtr<uint8_t>();
+    }
+
     // check if the packet has data as for example prefetch and flush
     // requests do not
     std::shared_ptr<RubyRequest> msg =
-        std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
-                                      pkt->isFlush() ?
-                                      nullptr : pkt->getPtr<uint8_t>(),
+        std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(), data,
                                       pkt->getSize(), pc, secondary_type,
                                       RubyAccessMode_Supervisor, pkt,
                                       PrefetchBit_No, proc_id);