# Node ID d8ed395159dc893a45acfb74b518377bd8421aa3
# Parent  d010e6a8e783e0ccead48292b9759eb700e66a85

diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -248,6 +248,9 @@
     /** Attempts to send a store to the cache. */
     bool sendStore(PacketPtr data_pkt);
 
+    /** Attempts to send a store exclusive prefetch to the cache. */
+    bool sendStoreAccess(PacketPtr data_pkt);
+
     /** Increments the given store index (circular queue). */
     inline void incrStIdx(int &store_idx) const;
     /** Decrements the given store index (circular queue). */
@@ -509,6 +512,12 @@
     /** Number of times the LSQ is blocked due to the cache. */
     Stats::Scalar lsqCacheBlocked;
 
+    /** Number of times a StoreAccess was sent successfully. */
+    Stats::Scalar lsqStoreAccessNonBlocked;
+
+    /** Number of times a StoreAccess could not be sent. */
+    Stats::Scalar lsqStoreAccessBlocked;
+
   public:
     /** Executes the load at the given index. */
     Fault read(Request *req, Request *sreqLow, Request *sreqHigh,
@@ -893,15 +902,81 @@
     assert(size <= sizeof(storeQueue[store_idx].data) ||
            (req->getFlags() & Request::CACHE_BLOCK_ZERO));
 
+    bool split = false;
     // Split stores can only occur in ISAs with unaligned memory accesses. If
     // a store request has been split, sreqLow and sreqHigh will be non-null.
     if (TheISA::HasUnalignedMemAcc && sreqLow) {
         storeQueue[store_idx].isSplit = true;
+        split = true;
     }
 
     if (!(req->getFlags() & Request::CACHE_BLOCK_ZERO))
         memcpy(storeQueue[store_idx].data, data, size);
 
+    // Issue a store access request
+    if (usedPorts < cachePorts && !hasPendingPkt &&
+        size != 0 && !storeQueue[store_idx].inst->isDataPrefetch() &&
+        !req->isMmappedIpr() && !req->isUncacheable() &&
+        cpu->system->isMemAddr(req->getPaddr())) {
+
+        ++usedPorts;
+        DynInstPtr inst = storeQueue[store_idx].inst;
+        MemCmd command = MemCmd::StoreAccess;
+
+        PacketPtr data_pkt;
+        PacketPtr snd_data_pkt = NULL;
+
+        LSQSenderState *state = new LSQSenderState;
+        state->isLoad = false;
+        state->idx = store_idx;
+
+        if (!split) {
+            // Build a single data packet if the store isn't split.
+            data_pkt = new Packet(req, command);
+            data_pkt->senderState = state;
+        } else {
+            // Create two packets if the store is split in two.
+            data_pkt = new Packet(sreqLow, command);
+            snd_data_pkt = new Packet(sreqHigh, command);
+
+            data_pkt->senderState = state;
+            snd_data_pkt->senderState = state;
+
+            state->isSplit = true;
+            state->outstanding = 2;
+
+            req = sreqLow;
+        }
+
+        DPRINTF(LSQUnit, "D-Cache: store exclusive prefetch idx:%i PC:%s "
+                "to Addr:%#x, [sn:%lli]\n", store_idx, inst->pcState(),
+                req->getPaddr(), inst->seqNum);
+
+        if (!sendStoreAccess(data_pkt)) {
+            DPRINTF(LSQUnit, "D-Cache became blocked when writing store "
+                    "permission prefetch [sn:%lli], will NOT retry later\n",
+                    inst->seqNum);
+            ++lsqStoreAccessBlocked;
+        } else {
+            // Sent the first packet successfully. If the store is split,
+            // try to send the second packet too.
+            ++lsqStoreAccessNonBlocked;
+            if (split) {
+                assert(snd_data_pkt);
+                //
+                // Ensure there are enough ports to use.
+                //
+                if (usedPorts < cachePorts) {
+                    ++usedPorts;
+                    if (!sendStoreAccess(snd_data_pkt))
+                        ++lsqStoreAccessBlocked;
+                    else
+                        ++lsqStoreAccessNonBlocked;
+                }
+            }
+        }
+    }
+
     // This function only writes the data to the store queue, so no fault
     // can happen here.
     return NoFault;
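The issue gate at the top of the new block packs several conditions into one test: a free cache port, no packet already pending, a store that actually carries data, not itself a software prefetch, and an ordinary cacheable memory target. A standalone sketch of the same predicate, with hypothetical simplified types rather than the real gem5 classes:

    // Sketch only -- ToyReq stands in for gem5's Request.
    #include <cstddef>

    struct ToyReq {
        bool mmappedIpr;     // memory-mapped IPR access
        bool uncacheable;    // uncacheable address
        bool validMemAddr;   // stands in for cpu->system->isMemAddr(paddr)
    };

    // Mirror of the condition guarding the StoreAccess issue above.
    bool shouldIssueStoreAccess(int usedPorts, int cachePorts,
                                bool hasPendingPkt, std::size_t size,
                                bool isDataPrefetch, const ToyReq &req)
    {
        return usedPorts < cachePorts && !hasPendingPkt && size != 0 &&
               !isDataPrefetch && !req.mmappedIpr && !req.uncacheable &&
               req.validMemAddr;
    }

If the gate fails, no StoreAccess is issued and the store simply waits for its normal writeback; the prefetch is purely opportunistic.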
diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -94,6 +94,8 @@
 void
 LSQUnit<Impl>::completeDataAccess(PacketPtr pkt)
 {
+    DPRINTF(Activity, "CompleteDataAccess for address: 0x%x; storeAccess: %s\n",
+            pkt->getAddr(), pkt->isStoreAccess() ? "yes" : "no");
     LSQSenderState *state = dynamic_cast<LSQSenderState *>(pkt->senderState);
     DynInstPtr inst = state->inst;
 
     DPRINTF(IEW, "Writeback event [sn:%lli].\n", inst->seqNum);
@@ -255,7 +257,18 @@
 
     lsqCacheBlocked
         .name(name() + ".cacheBlocked")
-        .desc("Number of times an access to memory failed due to the cache being blocked");
+        .desc("Number of times an access to memory failed "
+              "due to the cache being blocked");
+
+    lsqStoreAccessNonBlocked
+        .name(name() + ".storeAccessNonBlocked")
+        .desc("Number of times a StoreAccess to memory "
+              "was sent successfully");
+
+    lsqStoreAccessBlocked
+        .name(name() + ".storeAccessBlocked")
+        .desc("Number of times a StoreAccess to memory failed "
+              "due to the cache being blocked");
 }
 
 template <class Impl>
@@ -1223,6 +1236,17 @@
     return true;
 }
 
+// Exclusive permission prefetch for a store
+template <class Impl>
+bool
+LSQUnit<Impl>::sendStoreAccess(PacketPtr data_pkt)
+{
+    if (!dcachePort->sendTimingReq(data_pkt)) {
+        return false;
+    }
+    return true;
+}
+
 template <class Impl>
 void
 LSQUnit<Impl>::recvRetry()

diff --git a/src/mem/abstract_mem.cc b/src/mem/abstract_mem.cc
--- a/src/mem/abstract_mem.cc
+++ b/src/mem/abstract_mem.cc
@@ -379,7 +379,7 @@
         bytesRead[pkt->req->masterId()] += pkt->getSize();
         if (pkt->req->isInstFetch())
             bytesInstRead[pkt->req->masterId()] += pkt->getSize();
-    } else if (pkt->isInvalidate()) {
+    } else if (pkt->isInvalidate() || pkt->isStoreAccess()) {
         // no need to do anything
         // this clause is intentionally before the write clause: the only
         // transaction that is both a write and an invalidate is
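The two new counters partition every StoreAccess attempt: each call to sendStoreAccess() bumps exactly one of them, and a blocked attempt is dropped rather than retried. A minimal sketch of that accounting pattern (hypothetical names, plain integers instead of Stats::Scalar):

    #include <cstdint>
    #include <functional>

    // Toy counters standing in for lsqStoreAccessNonBlocked/Blocked.
    struct StoreAccessStats {
        uint64_t nonBlocked = 0;  // port accepted the packet
        uint64_t blocked = 0;     // port refused; the prefetch is dropped
    };

    // Wrap a send attempt so each outcome lands in exactly one counter,
    // mirroring the sendStoreAccess() call sites in write().
    bool trySendStoreAccess(const std::function<bool()> &sendTimingReq,
                            StoreAccessStats &stats)
    {
        if (sendTimingReq()) {
            ++stats.nonBlocked;
            return true;
        }
        ++stats.blocked;
        return false;
    }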
diff --git a/src/mem/cache/cache_impl.hh b/src/mem/cache/cache_impl.hh
--- a/src/mem/cache/cache_impl.hh
+++ b/src/mem/cache/cache_impl.hh
@@ -333,6 +333,10 @@
         return false;
     }
 
+    if (pkt->isStoreAccess()) {
+        DPRINTF(Cache, "StoreAccess request: %x\n", pkt->getAddr());
+    }
+
     int id = pkt->req->hasContextId() ? pkt->req->contextId() : -1;
     // Here lat is the value passed as parameter to accessBlock() function
     // that can modify its value.
@@ -372,6 +376,10 @@
         DPRINTF(Cache, "%s new state is %s\n", __func__, blk->print());
         incHitCount(pkt);
         return true;
+    } else if (pkt->isStoreAccess() && blk != NULL && blk->isWritable()) {
+        // found a block and it's already in exclusive state
+        incHitCount(pkt);
+        return true;
     } else if ((blk != NULL) && (pkt->needsExclusive() ?
                blk->isWritable() : blk->isReadable())) {
@@ -392,6 +400,9 @@
         return true;
     }
 
+    if (pkt->isStoreAccess()) {
+        DPRINTF(Cache, "StoreAccess miss in cache\n");
+    }
     return false;
 }
@@ -581,6 +592,10 @@
         assert(!pkt->req->isUncacheable());
 
         // hit (for all other request types)
+        if (pkt->isStoreAccess()) {
+            // StoreAccess hit in cache with exclusive permission, do nothing
+            return true;
+        }
 
         if (prefetcher && (prefetchOnAccess ||
                            (blk && blk->wasPrefetched()))) {
             if (blk)
@@ -593,6 +608,8 @@
 
         if (needsResponse) {
             pkt->makeTimingResponse();
+            DPRINTF(Cache, "Schedule timing response from cache, "
+                    "latency: %d, clock: %lld\n", lat, clockEdge(lat));
             // @todo: Make someone pay for this
             pkt->headerDelay = pkt->payloadDelay = 0;
@@ -610,6 +627,23 @@
         }
     } else {
         // miss
+        // StoreAccess prefetch
+        if (pkt->isStoreAccess()) {
+            DPRINTF(Cache, "StoreAccess missed in cache, or the block "
+                    "lacks exclusive permission\n");
+            if (prefetcher) {
+                DPRINTF(Cache, "Prefetcher enabled, calculating "
+                        "next_pf_time\n");
+                next_pf_time = prefetcher->notify(pkt);
+            }
+            if (next_pf_time != 0) {
+                DPRINTF(Cache, "Sending prefetch request for StoreAccess\n");
+                requestMemSideBus(Request_PF,
+                    std::max(prefetcher->nextPrefetchReadyTime(),
+                             next_pf_time));
+            }
+            return true;
+        }
 
         Addr blk_addr = blockAlign(pkt->getAddr());
@@ -801,8 +834,6 @@
         return NULL;
     }
 
-    assert(cpu_pkt->needsResponse());
-
     MemCmd cmd;
     // @TODO make useUpgrades a parameter.
     // Note that ownership protocols require upgrade, otherwise a

diff --git a/src/mem/cache/mshr.cc b/src/mem/cache/mshr.cc
--- a/src/mem/cache/mshr.cc
+++ b/src/mem/cache/mshr.cc
@@ -215,7 +215,7 @@
     assert(targets.isReset());
     // Don't know of a case where we would allocate a new MSHR for a
     // snoop (mem-side request), so set source according to request here
-    Target::Source source = (target->cmd == MemCmd::HardPFReq) ?
+    Target::Source source = (target->cmd.isHWPrefetch()) ?
         Target::FromPrefetcher : Target::FromCPU;
     targets.add(target, when_ready, _order, source, true);
     assert(deferredTargets.isReset());
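Taken together, the access()/recvTimingReq() changes give a StoreAccess three possible fates: it hits a writable block and is finished, it hits a read-only block or misses and is handed to the prefetcher as a HardPFExReq, and in no case does it allocate an MSHR of its own. A compact sketch of that classification, using a hypothetical simplified block state instead of gem5's CacheBlk:

    // Sketch only -- not the real gem5 coherence states.
    enum class BlkState { Invalid, Shared, Exclusive };

    enum class StoreAccessOutcome {
        HitWritable,   // block already writable: nothing to do
        NeedPrefetch   // miss or read-only hit: queue a HardPFExReq
    };

    StoreAccessOutcome classifyStoreAccess(BlkState blk)
    {
        if (blk == BlkState::Exclusive)
            return StoreAccessOutcome::HitWritable;
        // Shared or Invalid: exclusive permission has to be fetched, so
        // the request is converted into a prefetch-exclusive.
        return StoreAccessOutcome::NeedPrefetch;
    }

The mshr.cc change is what lets the converted HardPFExReq be tagged Target::FromPrefetcher: the test is widened from the single HardPFReq command to any command carrying the IsHWPrefetch attribute.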
" "addr:%#x tick:%lld.\n", pf_addr, pf_time); diff --git a/src/mem/packet.hh b/src/mem/packet.hh --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -89,6 +89,7 @@ Writeback, SoftPFReq, HardPFReq, + HardPFExReq, SoftPFResp, HardPFResp, WriteInvalidateReq, @@ -120,6 +121,7 @@ PrintReq, // Print state matching address FlushReq, //request for a cache flush InvalidationReq, // request for address to be invalidated from lsq + StoreAccess, //exclusive permission NUM_MEM_CMDS }; @@ -144,6 +146,7 @@ IsError, //!< Error response IsPrint, //!< Print state matching address (for debugging) IsFlush, //!< Flush the address from caches + IsStoreAccess, //!< exclusive permission NUM_COMMAND_ATTRIBUTES }; @@ -202,6 +205,7 @@ bool isError() const { return testCmdAttrib(IsError); } bool isPrint() const { return testCmdAttrib(IsPrint); } bool isFlush() const { return testCmdAttrib(IsFlush); } + bool isStoreAccess() const { return testCmdAttrib(IsStoreAccess); } const Command responseCommand() const @@ -486,6 +490,7 @@ bool isError() const { return cmd.isError(); } bool isPrint() const { return cmd.isPrint(); } bool isFlush() const { return cmd.isFlush(); } + bool isStoreAccess() const { return cmd.isStoreAccess(); } // Snoop flags void assertMemInhibit() @@ -554,8 +559,8 @@ } /** - * When ruby is in use, Ruby will monitor the cache line and thus M5 - * phys memory should treat LL ops as normal reads. + * When ruby is in use, Ruby will monitor the cache line and thus M5 + * phys memory should treat LL ops as normal reads. */ void convertLlToRead() @@ -695,8 +700,12 @@ // If this is a request packet for which there's no response, // delete the request object here, since the requester will // never get the chance. - if (req && isRequest() && !needsResponse()) + if (req && isRequest() && !needsResponse() + && !cmd.isHWPrefetch()) { + // If HWExPrefetch, it doesn't need a response, hence already + // deletes req when getting the response in handleResponse function delete req; + } deleteData(); } diff --git a/src/mem/packet.cc b/src/mem/packet.cc --- a/src/mem/packet.cc +++ b/src/mem/packet.cc @@ -92,6 +92,9 @@ /* HardPFReq */ { SET4(IsRead, IsRequest, IsHWPrefetch, NeedsResponse), HardPFResp, "HardPFReq" }, + /* HardPFExReq */ + { SET4(IsRead, IsRequest, IsHWPrefetch, NeedsExclusive), + InvalidCmd, "HardPFExReq" }, /* SoftPFResp */ { SET4(IsRead, IsResponse, IsSWPrefetch, HasData), InvalidCmd, "SoftPFResp" }, @@ -170,6 +173,9 @@ /* Invalidation Request */ { SET3(NeedsExclusive, IsInvalidate, IsRequest), InvalidCmd, "InvalidationReq" }, + /* StoreAccess Request */ + { SET3(NeedsExclusive, IsRequest, IsStoreAccess), InvalidCmd, + "StoreAccessReq" }, }; bool diff --git a/src/mem/protocol/RubySlicc_Exports.sm b/src/mem/protocol/RubySlicc_Exports.sm --- a/src/mem/protocol/RubySlicc_Exports.sm +++ b/src/mem/protocol/RubySlicc_Exports.sm @@ -135,6 +135,7 @@ COMMIT, desc="Commit version"; NULL, desc="Invalid request type"; FLUSH, desc="Flush request type"; + Store_Access, desc="Store Permission Prefetch"; } enumeration(SequencerRequestType, desc="...", default="SequencerRequestType_NULL") { diff --git a/src/mem/ruby/system/RubyPort.hh b/src/mem/ruby/system/RubyPort.hh --- a/src/mem/ruby/system/RubyPort.hh +++ b/src/mem/ruby/system/RubyPort.hh @@ -170,6 +170,7 @@ protected: void trySendRetries(); + void retry_blocked_port(PacketPtr pkt); void ruby_hit_callback(PacketPtr pkt); void testDrainComplete(); void ruby_eviction_callback(const Address& address); diff --git a/src/mem/ruby/system/RubyPort.cc 
diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@@ -352,6 +352,24 @@
 }
 
 void
+RubyPort::retry_blocked_port(PacketPtr pkt)
+{
+    DPRINTF(RubyPort, "Potential blocked port for request: %s 0x%x\n",
+            pkt->cmdString(), pkt->getAddr());
+
+    // Retrieve the request port from the sender state
+    RubyPort::SenderState *senderState =
+        safe_cast<RubyPort::SenderState *>(pkt->popSenderState());
+    MemSlavePort *port = senderState->port;
+    assert(port != NULL);
+    delete senderState;
+
+    if (!onRetryList(port)) {
+        addToRetryList(port);
+    }
+}
+
+void
 RubyPort::ruby_hit_callback(PacketPtr pkt)
 {
     DPRINTF(RubyPort, "Hit callback for %s 0x%x\n", pkt->cmdString(),

diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc
--- a/src/mem/ruby/system/Sequencer.cc
+++ b/src/mem/ruby/system/Sequencer.cc
@@ -389,7 +389,8 @@
     while (!i->second->empty()) {
         request = i->second->front();
         if(ruby_request) {
-            assert((request->m_type == RubyRequestType_ST) ||
+            assert((request->m_type == RubyRequestType_Store_Access) ||
+                   (request->m_type == RubyRequestType_ST) ||
                    (request->m_type == RubyRequestType_ATOMIC) ||
                    (request->m_type == RubyRequestType_RMW_Read) ||
                    (request->m_type == RubyRequestType_RMW_Write) ||
@@ -538,7 +539,7 @@
     if (g_system_ptr->m_warmup_enabled) {
         data.setData(pkt->getConstPtr<uint8_t>(),
                      request_address.getOffset(), pkt->getSize());
-    } else if (!pkt->isFlush()) {
+    } else if (!pkt->isFlush() && !pkt->isStoreAccess()) {
         if ((type == RubyRequestType_LD) ||
             (type == RubyRequestType_IFETCH) ||
             (type == RubyRequestType_RMW_Read) ||
@@ -578,7 +579,13 @@
         delete pkt;
         g_system_ptr->m_cache_recorder->enqueueNextFlushRequest();
     } else {
-        ruby_hit_callback(pkt);
+        if (type != RubyRequestType_Store_Access) {
+            ruby_hit_callback(pkt);
+        } else {
+            // A StoreAccess can block the CPU from issuing stores, so call
+            // retry_blocked_port so the CPU can reissue blocked stores.
+            retry_blocked_port(pkt);
+        }
     }
 }
@@ -597,8 +604,11 @@
     RubyRequestType primary_type = RubyRequestType_NULL;
     RubyRequestType secondary_type = RubyRequestType_NULL;
 
-
-    if (pkt->isLLSC()) {
+    if (pkt->isStoreAccess()) {
+        DPRINTF(RubySequencer, "Issuing Store Access\n");
+        primary_type = RubyRequestType_Store_Access;
+        secondary_type = RubyRequestType_ST;
+    } else if (pkt->isLLSC()) {
         //
         // Alpha LL/SC instructions need to be handled carefully by the cache
         // coherence protocol to ensure they follow the proper semantics. In
@@ -691,12 +702,18 @@
         pc = pkt->req->getPC();
     }
 
+    // Certain requests, such as flushes and StoreAccesses, carry no data.
+    uint8_t *data;
+    if (pkt->isFlush() || pkt->isStoreAccess()) {
+        data = nullptr;
+    } else {
+        data = pkt->getPtr<uint8_t>();
+    }
+
     // check if the packet has data as for example prefetch and flush
     // requests do not
     std::shared_ptr<RubyRequest> msg =
-        std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
-                                      pkt->isFlush() ?
-                                      nullptr : pkt->getPtr<uint8_t>(),
+        std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(), data,
                                       pkt->getSize(), pc, secondary_type,
                                       RubyAccessMode_Supervisor, pkt,
                                       PrefetchBit_No, proc_id);
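On the Ruby side, the sequencer issues a StoreAccess under its own primary type but completes it with store semantics, and its completion wakes any CPU port the outstanding prefetch may have stalled. A condensed sketch of the type selection in issueRequest(), with hypothetical stand-ins for the packet flags:

    #include <utility>

    // Toy mirror of the (primary, secondary) selection above.
    enum class RubyReqType { LD, ST, Store_Access };

    struct PacketBits { bool storeAccess; bool write; };

    // The primary type drives the protocol transition; the secondary type
    // tells the sequencer how to treat the completion.
    std::pair<RubyReqType, RubyReqType> classify(const PacketBits &pkt)
    {
        if (pkt.storeAccess)
            return {RubyReqType::Store_Access, RubyReqType::ST};
        if (pkt.write)
            return {RubyReqType::ST, RubyReqType::ST};
        return {RubyReqType::LD, RubyReqType::LD};
    }

Because a StoreAccess carries no data and expects no response, the hit path must not call ruby_hit_callback(); retry_blocked_port() only unblocks the issuing port so that stalled stores can reissue.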