# Node ID 4da865dce0dae6c70c7bed8f115280e779d1c654
# Parent  f377969aa1b20cd4724021b6bb003c10eb8ac9d0
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -265,6 +265,9 @@
     /** Attempts to send a store to the cache. */
     bool sendStore(PacketPtr data_pkt);
 
+    /** Attempts to send a store exclusive prefetch to the cache. */
+    bool sendStoreAccess(PacketPtr data_pkt);
+
     /** Increments the given store index (circular queue). */
     inline void incrStIdx(int &store_idx) const;
     /** Decrements the given store index (circular queue). */
@@ -532,6 +535,14 @@
     /** Number of times the LSQ is blocked due to the cache. */
     Stats::Scalar lsqCacheBlocked;
 
+    /** Number of times StoreAccess gets sent successfully */
+    Stats::Scalar lsqStoreAccessNonBlocked;
+
+    /** Number of times StoreAccess couldn't get sent successfully */
+    Stats::Scalar lsqStoreAccessBlocked;
+
+
+
   public:
     /** Executes the load at the given index. */
     Fault read(Request *req, Request *sreqLow, Request *sreqHigh,
@@ -935,13 +946,89 @@
 
     // Split stores can only occur in ISAs with unaligned memory accesses.  If
     // a store request has been split, sreqLow and sreqHigh will be non-null.
+    bool split = false;
     if (TheISA::HasUnalignedMemAcc && sreqLow) {
         storeQueue[store_idx].isSplit = true;
+        split = true;
     }
 
     if (!(req->getFlags() & Request::CACHE_BLOCK_ZERO))
         memcpy(storeQueue[store_idx].data, data, size);
 
+    // Eagerly issue a StoreAccess (store exclusive prefetch) for this store
+    // when a cache port is free, no packet is pending, the cache is unblocked
+    // and the store is non-empty, cacheable, not an IPR and not a prefetch.
+    if (usedPorts < cachePorts  && !hasPendingPkt &&
+                /*!isStoreBlocked && */ !lsq->cacheBlocked() &&
+                size != 0 && !storeQueue[store_idx].inst->isDataPrefetch() &&
+                !req->isMmappedIpr() && !req->isUncacheable()) {
+
+        ++usedPorts;
+        DynInstPtr inst = storeQueue[store_idx].inst;
+        MemCmd command = MemCmd::StoreAccess;
+
+        PacketPtr data_pkt;
+        PacketPtr snd_data_pkt = NULL;
+
+        LSQSenderState *state = new LSQSenderState;
+        state->isLoad = false;
+        state->idx = store_idx;
+
+        if (!split) {
+            //
+            // Unsplit store: one packet carrying the original request.
+            //
+            data_pkt = new Packet(req, command);
+            data_pkt->senderState = state;
+        } else {
+            //
+            // Split store: one packet per half, sharing one sender state.
+            //
+            data_pkt = new Packet(sreqLow, command);
+            snd_data_pkt = new Packet(sreqHigh, command);
+
+            data_pkt->senderState = state;
+            snd_data_pkt->senderState = state;
+
+            state->isSplit = true;
+            state->outstanding = 2;
+
+            req = sreqLow; // so the trace below prints the low half's address
+        }
+
+        DPRINTF(LSQUnit, "D-Cache: store exclusive prefetch idx:%i PC:%s "
+                        "to Addr:%#x, [sn:%lli]\n",
+                        store_idx, inst->pcState(),
+                        req->getPaddr(),
+                        inst->seqNum);
+
+        if (!sendStoreAccess(data_pkt)) {
+            DPRINTF(LSQUnit, "D-Cache became blocked when writing store "
+                             "prefetch [sn:%lli], will NOT retry later\n",
+                             inst->seqNum);
+            ++lsqStoreAccessBlocked;
+        } else {
+            //
+            // The first packet was accepted. For a split store, also try
+            // to send the second packet.
+            //
+            ++lsqStoreAccessNonBlocked;
+            if (split) {
+                assert(snd_data_pkt);
+                //
+                // The second half needs its own free port. NOTE(review): if
+                // none is free it is neither sent nor freed here - confirm.
+                if (usedPorts < cachePorts) {
+                    ++usedPorts;
+                    if (!sendStoreAccess(snd_data_pkt))
+                        ++lsqStoreAccessBlocked;
+                    else
+                        ++lsqStoreAccessNonBlocked;
+                }
+            }
+        }
+    }
+
     // This function only writes the data to the store queue, so no fault
     // can happen here.
     return NoFault;
diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -94,6 +94,8 @@
 void
 LSQUnit<Impl>::completeDataAccess(PacketPtr pkt)
 {
+    DPRINTF(Activity, "CompleteDataAccess for address: 0x%x; storeAccess: %s\n",
+                       pkt->getAddr(), pkt->isStoreAccess()? "yes" : "no");
     LSQSenderState *state = dynamic_cast<LSQSenderState *>(pkt->senderState);
     DynInstPtr inst = state->inst;
     DPRINTF(IEW, "Writeback event [sn:%lli].\n", inst->seqNum);
@@ -259,7 +261,18 @@
 
     lsqCacheBlocked
         .name(name() + ".cacheBlocked")
-        .desc("Number of times an access to memory failed due to the cache being blocked");
+        .desc("Number of times an access to memory failed "
+              "due to the cache being blocked");
+
+    lsqStoreAccessNonBlocked
+        .name(name() + ".storeAccessNonBlocked")
+        .desc("Number of times a StoreAccess to memory "
+              "sent successfully");
+
+    lsqStoreAccessBlocked
+        .name(name() + ".storeAccessBlocked")
+        .desc("Number of times a StoreAccess to memory failed "
+              "due to cache being blocked");
 }
 
 template<class Impl>
@@ -1227,6 +1240,17 @@
     return true;
 }
 
+/** Sends a store exclusive prefetch packet out through the D-cache port. */
+template <class Impl>
+bool
+LSQUnit<Impl>::sendStoreAccess(PacketPtr data_pkt)
+{
+    // sendTimingReq() reports whether the port took the packet; that result
+    // is handed straight back and no retry is arranged in this function.
+    const bool accepted = dcachePort->sendTimingReq(data_pkt);
+    return accepted;
+}
+
 template <class Impl>
 void
 LSQUnit<Impl>::recvRetry()
diff --git a/src/mem/abstract_mem.cc b/src/mem/abstract_mem.cc
--- a/src/mem/abstract_mem.cc
+++ b/src/mem/abstract_mem.cc
@@ -385,9 +385,10 @@
             numWrites[pkt->req->masterId()]++;
             bytesWritten[pkt->req->masterId()] += pkt->getSize();
         }
-    } else if (pkt->isInvalidate()) {
+    } else if (pkt->isInvalidate() || pkt->isStoreAccess()) {
         // no need to do anything
-    } else {
+    }
+    else {
         panic("unimplemented");
     }
 
diff --git a/src/mem/cache/cache_impl.hh b/src/mem/cache/cache_impl.hh
--- a/src/mem/cache/cache_impl.hh
+++ b/src/mem/cache/cache_impl.hh
@@ -309,7 +309,13 @@
             blk ? "hit" : "miss", blk ? blk->print() : "");
 
     if (blk != NULL) {
-
+        //
+        //found a block and it's already in exclusive state
+        //
+        if (pkt->isStoreAccess() && blk->isWritable()) {
+            incHitCount(pkt);
+            return true;
+        }
         if (pkt->needsExclusive() ? blk->isWritable() : blk->isReadable()) {
             // OK to satisfy access
             incHitCount(pkt);
@@ -510,6 +516,10 @@
     bool needsResponse = pkt->needsResponse();
 
     if (satisfied) {
+        if (pkt->isStoreAccess()) {
+            //StoreAccess hit in cache with exclusive permission, do nothing
+            return true;
+        }
         if (prefetcher && (prefetchOnAccess || (blk && blk->wasPrefetched()))) {
             if (blk)
                 blk->status &= ~BlkHWPrefetched;
@@ -518,6 +528,8 @@
 
         if (needsResponse) {
             pkt->makeTimingResponse();
+            DPRINTF(Cache, "Schedule timing response from cache, "
+                           "latency: %d, clock: %lld\n", lat, clockEdge(lat));
             // @todo: Make someone pay for this
             pkt->busFirstWordDelay = pkt->busLastWordDelay = 0;
             cpuSidePort->schedTimingResp(pkt, clockEdge(lat));
@@ -529,6 +541,20 @@
         }
     } else {
         // miss
+        //StoreAccess prefetch
+        if (pkt->isStoreAccess()) {
+            DPRINTF(Cache, "StoreAccess missed in cache or with no "
+                           "exclusive permission\n");
+            if (prefetcher) {
+                DPRINTF(Cache, "Prefetcher enabled, calculating next_pf_time\n");
+                next_pf_time = prefetcher->notify(pkt, time);
+            }
+            if (next_pf_time != 0) {
+                DPRINTF(Cache, "Sending prefetch request for StoreAccess\n");
+                requestMemSideBus(Request_PF, std::max(time, next_pf_time));
+            }
+            return true;
+        }
 
         // @todo: Make someone pay for this
         pkt->busFirstWordDelay = pkt->busLastWordDelay = 0;
@@ -548,6 +574,8 @@
                 mshr->threadNum = -1;
             }
             mshr->allocateTarget(pkt, time, order++);
+            DPRINTF(Cache, "Allocate a target in mshr queue for address: 0x%x, "
+                           "current size: %d\n", pkt->getAddr(), mshr->getNumTargets());
             if (mshr->getNumTargets() == numTarget) {
                 noTargetMSHR = mshr;
                 setBlocked(Blocked_NoTargets);
@@ -641,8 +669,6 @@
         return NULL;
     }
 
-    assert(cpu_pkt->needsResponse());
-
     MemCmd cmd;
     // @TODO make useUpgrades a parameter.
     // Note that ownership protocols require upgrade, otherwise a
@@ -947,6 +973,7 @@
 
         switch (target->source) {
           case MSHR::Target::FromCPU:
+            DPRINTF(Cache, "HandleResponse fromCPU\n");
             Tick completion_time;
             if (is_fill) {
                 satisfyCpuSideRequest(target->pkt, blk,
@@ -1012,8 +1039,9 @@
             break;
 
           case MSHR::Target::FromPrefetcher:
-            assert(target->pkt->cmd == MemCmd::HardPFReq);
-            if (blk)
+            DPRINTF(Cache, "HandleResponse fromPrefetcher\n");
+            assert(target->pkt->cmd.isHWPrefetch()); // HardPFReq/HardPFExReq
+            if (blk)
                 blk->status |= BlkHWPrefetched;
             delete target->pkt->req;
             delete target->pkt;
@@ -1679,6 +1707,7 @@
     assert(!miss_mshr && !write_mshr);
     if (prefetcher && !mshrQueue.isFull()) {
         // If we have a miss queue slot, we can try a prefetch
+        DPRINTF(Cache, "Trying prefetch queue\n");
         PacketPtr pkt = prefetcher->getPacket();
         if (pkt) {
             Addr pf_addr = blockAlign(pkt->getAddr());
diff --git a/src/mem/cache/mshr.cc b/src/mem/cache/mshr.cc
--- a/src/mem/cache/mshr.cc
+++ b/src/mem/cache/mshr.cc
@@ -218,7 +218,7 @@
     assert(targets.isReset());
     // Don't know of a case where we would allocate a new MSHR for a
     // snoop (mem-side request), so set source according to request here
-    Target::Source source = (target->cmd == MemCmd::HardPFReq) ?
+    Target::Source source = (target->cmd.isHWPrefetch()) ?
         Target::FromPrefetcher : Target::FromCPU;
     targets.add(target, whenReady, _order, source, true);
     assert(deferredTargets.isReset());
diff --git a/src/mem/cache/prefetch/base.cc b/src/mem/cache/prefetch/base.cc
--- a/src/mem/cache/prefetch/base.cc
+++ b/src/mem/cache/prefetch/base.cc
@@ -187,6 +187,37 @@
 Tick
 BasePrefetcher::notify(PacketPtr &pkt, Tick tick)
 {
+    if (pkt->isStoreAccess()) {
+        // Convert the StoreAccess into a block-aligned HardPFExReq and
+        // queue it; this branch returns before the checks further down.
+        Addr blk_addr = pkt->getAddr() & ~(Addr)(blkSize-1);
+        Request *prefetchReq = new Request(blk_addr, blkSize, 0, masterId);
+        PacketPtr prefetch = new Packet(prefetchReq, MemCmd::HardPFExReq);
+
+        DPRINTF(HWPrefetch, "Add StoreAccess request "
+                            "with blk_addr: 0x%x to pf queue\n", blk_addr);
+
+        prefetch->allocate();
+        prefetch->req->setThreadContext(pkt->req->contextId(),
+                                        pkt->req->threadId());
+
+        // When the queue is full, drop its oldest (head) entry to make room
+        if (pf.size() == size) {
+            pfRemovedFull++;
+            PacketPtr old_pkt = pf.begin()->pkt;
+            DPRINTF(HWPrefetch, "Prefetch queue full, removing oldest 0x%x\n",
+                                 old_pkt->getAddr());
+            delete old_pkt->req;
+            delete old_pkt;
+            pf.pop_front();
+        }
+
+        pf.push_back(DeferredPacket(tick, prefetch));
+
+        // Report the tick of the queue head (pf is non-empty after push_back)
+        return pf.empty() ? 0 : pf.front().tick;
+    }
+
     // Don't consult the prefetcher if any of the following conditons are true
     // 1) The request is uncacheable
     // 2) The request is a fetch, but we are only prefeching data
@@ -287,6 +318,8 @@
                 pf.pop_front();
             }
 
+            DPRINTF(HWPrefetch, "Add non StoreAccess request with blk_addr: "
+                                "0x%x to pf queue\n", *addrIter);
             pf.push_back(DeferredPacket(tick + clockPeriod() * *delayIter,
                                         prefetch));
         }
diff --git a/src/mem/coherent_bus.cc b/src/mem/coherent_bus.cc
--- a/src/mem/coherent_bus.cc
+++ b/src/mem/coherent_bus.cc
@@ -431,6 +431,7 @@
         // (corresponding to our own slave port that is also in
         // snoopPorts) and should not send it back to where it came
         // from
+
         if (exclude_slave_port_id == InvalidPortID ||
             p->getId() != exclude_slave_port_id) {
             // cache is not allowed to refuse snoop
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -89,6 +89,7 @@
         Writeback,
         SoftPFReq,
         HardPFReq,
+        HardPFExReq,
         SoftPFResp,
         HardPFResp,
         // WriteInvalidateReq transactions used to be generated by the
@@ -128,6 +129,7 @@
         PrintReq,       // Print state matching address
         FlushReq,      //request for a cache flush
         InvalidationReq,   // request for address to be invalidated from lsq
+        StoreAccess,	//exclusive permission
         NUM_MEM_CMDS
     };
 
@@ -152,6 +154,7 @@
         IsError,        //!< Error response
         IsPrint,        //!< Print state matching address (for debugging)
         IsFlush,        //!< Flush the address from caches
+        IsStoreAccess,  //!< exclusive permission
         NUM_COMMAND_ATTRIBUTES
     };
 
@@ -199,7 +202,8 @@
     bool isError() const        { return testCmdAttrib(IsError); }
     bool isPrint() const        { return testCmdAttrib(IsPrint); }
     bool isFlush() const        { return testCmdAttrib(IsFlush); }
-
+    bool isStoreAccess() const	{ return testCmdAttrib(IsStoreAccess); }
+    bool isHWPrefetch() const	{ return testCmdAttrib(IsHWPrefetch); }
     const Command
     responseCommand() const
     {
@@ -509,6 +513,7 @@
     bool isError() const        { return cmd.isError(); }
     bool isPrint() const        { return cmd.isPrint(); }
     bool isFlush() const        { return cmd.isFlush(); }
+    bool isStoreAccess() const	{ return cmd.isStoreAccess(); }
 
     // Snoop flags
     void assertMemInhibit()         { flags.set(MEM_INHIBIT); }
@@ -677,8 +682,14 @@
         // If this is a request packet for which there's no response,
         // delete the request object here, since the requester will
         // never get the chance.
-        if (req && isRequest() && !needsResponse())
+        if (req && isRequest() && !needsResponse()
+            && !cmd.isHWPrefetch()) {
+            //
+            // Hardware prefetch requests are excluded: HardPFExReq has no
+            // NeedsResponse attribute, and the cache's response handling
+            // already deletes req for prefetcher targets before the packet.
             delete req;
+        }
         deleteData();
     }
 
diff --git a/src/mem/packet.cc b/src/mem/packet.cc
--- a/src/mem/packet.cc
+++ b/src/mem/packet.cc
@@ -92,6 +92,9 @@
     /* HardPFReq */
     { SET4(IsRead, IsRequest, IsHWPrefetch, NeedsResponse),
             HardPFResp, "HardPFReq" },
+    /* HardPFExReq */
+    { SET4(IsRead, IsRequest, IsHWPrefetch, NeedsExclusive),
+            InvalidCmd, "HardPFExReq" },
     /* SoftPFResp */
     { SET4(IsRead, IsResponse, IsSWPrefetch, HasData),
             InvalidCmd, "SoftPFResp" },
@@ -169,6 +172,9 @@
     /* Invalidation Request */
     { SET3(NeedsExclusive, IsInvalidate, IsRequest),
       InvalidCmd, "InvalidationReq" },
+    /* StoreAccess Request */
+    { SET3(NeedsExclusive, IsRequest, IsStoreAccess), InvalidCmd,
+           "StoreAccessReq" },
 };
 
 bool
diff --git a/src/mem/protocol/RubySlicc_Exports.sm b/src/mem/protocol/RubySlicc_Exports.sm
--- a/src/mem/protocol/RubySlicc_Exports.sm
+++ b/src/mem/protocol/RubySlicc_Exports.sm
@@ -135,6 +135,7 @@
   COMMIT,            desc="Commit version";
   NULL,              desc="Invalid request type";
   FLUSH,             desc="Flush request type";
+  Store_Access,      desc="Store Permission Prefetch";
 }
 
 enumeration(SequencerRequestType, desc="...", default="SequencerRequestType_NULL") {
diff --git a/src/mem/ruby/system/RubyPort.hh b/src/mem/ruby/system/RubyPort.hh
--- a/src/mem/ruby/system/RubyPort.hh
+++ b/src/mem/ruby/system/RubyPort.hh
@@ -165,6 +165,7 @@
 
   protected:
     void ruby_hit_callback(PacketPtr pkt);
+    void rubyDoRetries();
     void testDrainComplete();
     void ruby_eviction_callback(const Address& address);
 
diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@@ -347,6 +347,14 @@
 
     slave_ports[pkt->getSrc()]->hitCallback(pkt);
 
+    rubyDoRetries();
+
+    testDrainComplete();
+}
+
+void
+RubyPort::rubyDoRetries()
+{
     //
     // If we had to stall the MemSlavePorts, wake them up because the sequencer
     // likely has free resources now.
@@ -370,8 +378,6 @@
             (*i)->sendRetry();
         }
     }
-
-    testDrainComplete();
 }
 
 void
diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc
--- a/src/mem/ruby/system/Sequencer.cc
+++ b/src/mem/ruby/system/Sequencer.cc
@@ -358,18 +358,21 @@
     assert(i != m_RequestTable.end());
 
     SequencerRequest* request;
-    request = i->second->front();
-    assert((request->m_type == RubyRequestType_ST) ||
-           (request->m_type == RubyRequestType_ATOMIC) ||
-           (request->m_type == RubyRequestType_RMW_Read) ||
-           (request->m_type == RubyRequestType_RMW_Write) ||
-           (request->m_type == RubyRequestType_Load_Linked) ||
-           (request->m_type == RubyRequestType_Store_Conditional) ||
-           (request->m_type == RubyRequestType_Locked_RMW_Read) ||
-           (request->m_type == RubyRequestType_Locked_RMW_Write) ||
-           (request->m_type == RubyRequestType_FLUSH));
-
+    bool ruby_request = true;
     while (!i->second->empty()) {
+        request = i->second->front();
+        if (ruby_request) {
+            assert((request->m_type == RubyRequestType_Store_Access) ||
+                   (request->m_type == RubyRequestType_ST) ||
+                   (request->m_type == RubyRequestType_ATOMIC) ||
+                   (request->m_type == RubyRequestType_RMW_Read) ||
+                   (request->m_type == RubyRequestType_RMW_Write) ||
+                   (request->m_type == RubyRequestType_Load_Linked) ||
+                   (request->m_type == RubyRequestType_Store_Conditional) ||
+                   (request->m_type == RubyRequestType_Locked_RMW_Read) ||
+                   (request->m_type == RubyRequestType_Locked_RMW_Write) ||
+                   (request->m_type == RubyRequestType_FLUSH));
+        }
 
         // handle write request
         if ((request->m_type != RubyRequestType_LD) &&
@@ -381,7 +384,7 @@
             // Not valid for Network_test protocl
             //
             bool success = true;
-            if(!m_usingNetworkTester)
+            if (!m_usingNetworkTester)
                 success = handleLlsc(address, request);
             if (request->m_type == RubyRequestType_Locked_RMW_Read) {
                 m_controller->blockOnQueue(address, m_mandatory_q_ptr);
@@ -402,9 +405,7 @@
 
         i->second->pop_front();
         markRemoved();
-
-        // Process the next entry in the list
-        request = i->second->front();
+        ruby_request = false;
     }
 
     // free all outstanding requests corresponding to this address
@@ -430,11 +431,13 @@
     assert(i != m_RequestTable.end());
 
     SequencerRequest* request;
-    request = i->second->front();
-    assert((request->m_type == RubyRequestType_LD) ||
-           (request->m_type == RubyRequestType_IFETCH));
-
+    bool ruby_request = true;
     while (!i->second->empty()) {
+        request = i->second->front();
+        if (ruby_request) {
+            assert((request->m_type == RubyRequestType_LD) ||
+                   (request->m_type == RubyRequestType_IFETCH));
+        }
         if ((request->m_type != RubyRequestType_LD) &&
             (request->m_type != RubyRequestType_IFETCH)) {
 
@@ -447,9 +450,7 @@
                     firstResponseTime);
         i->second->pop_front();
         markRemoved();
-
-        // Process the next entry in the list
-        request = i->second->front();
+        ruby_request = false;
     }
 
     // free all outstanding requests corresponding to this address
@@ -538,7 +539,15 @@
         delete pkt;
         g_system_ptr->m_cache_recorder->enqueueNextFlushRequest();
     } else {
-        ruby_hit_callback(pkt);
+        if (type != RubyRequestType_Store_Access) {
+            ruby_hit_callback(pkt);
+        }
+        else {
+            // A storeAccess can block the CPU from issuing stores, so call
+            // rubyDoRetries so the CPU can reissue blocked stores.
+            rubyDoRetries();
+            DPRINTF(RubySequencer, "StoreAccess's response; skip forwarding to CPU\n");
+        }
     }
 }
 
@@ -557,8 +566,12 @@
 
     RubyRequestType primary_type = RubyRequestType_NULL;
     RubyRequestType secondary_type = RubyRequestType_NULL;
-
-    if (pkt->isLLSC()) {
+    if (pkt->isStoreAccess()) {
+        DPRINTF(RubySequencer, "Issuing Store Access\n");
+        primary_type = RubyRequestType_Store_Access;
+        secondary_type = RubyRequestType_ST;
+    }
+    else if (pkt->isLLSC()) {
         //
         // Alpha LL/SC instructions need to be handled carefully by the cache
         // coherence protocol to ensure they follow the proper semantics. In