diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh
--- a/src/cpu/base_dyn_inst.hh
+++ b/src/cpu/base_dyn_inst.hh
@@ -150,6 +150,7 @@
         IsUncacheable,
         ReqMade,
         MemOpDone,
+        CacheAccessDelayed,
         MaxFlags
     };
 
@@ -238,8 +239,9 @@
 
     /////////////////////// TLB Miss //////////////////////
     /**
-     * Saved memory requests (needed when the DTB address translation is
-     * delayed due to a hw page table walk).
+     * Saved memory requests (needed when (1) the DTB address translation is
+     * delayed due to a hw page table walk, or (2) a load cannot be executed
+     * due to a lack of free cache ports).
      */
     RequestPtr savedReq;
     RequestPtr savedSreqLow;
@@ -357,6 +359,13 @@
         return (translationStarted() && !translationCompleted());
     }
 
+    /** True if this instruction's cache access was delayed because no cache
+     * port was free while it sat in the LSQ. While set, the instruction is
+     * deferred so that its execution can be attempted again.
+     */
+    bool cacheAccessDelayed() const { return instFlags[CacheAccessDelayed]; }
+    void cacheAccessDelayed(bool f) { instFlags[CacheAccessDelayed] = f; }
+
   public:
 #ifdef DEBUG
     void dumpSNList();
diff --git a/src/cpu/o3/O3CPU.py b/src/cpu/o3/O3CPU.py
--- a/src/cpu/o3/O3CPU.py
+++ b/src/cpu/o3/O3CPU.py
@@ -39,7 +39,9 @@
 
     activity = Param.Unsigned(0, "Initial count")
 
-    cachePorts = Param.Unsigned(200, "Cache Ports")
+    cacheReadPorts = Param.Unsigned(1, "Cache dedicated read ports")
+    cacheSharedPorts = Param.Unsigned(0, "Cache shared read/write ports")
+    cacheWritePorts = Param.Unsigned(1, "Cache dedicated write ports")
 
     decodeToFetchDelay = Param.Cycles(1, "Decode to fetch delay")
     renameToFetchDelay = Param.Cycles(1 ,"Rename to fetch delay")
diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh
--- a/src/cpu/o3/iew_impl.hh
+++ b/src/cpu/o3/iew_impl.hh
@@ -1265,6 +1265,15 @@
                     continue;
                 }
 
+                //to make sure we don't load more than the available cache read ports
+                if (inst->cacheAccessDelayed() && fault == NoFault) {
+                    // Not enough cache read port now, we should wait
+                    DPRINTF(IEW, "Execute: Delayed cache Access, deferring "
+                            "load.\n");
+                    instQueue.deferMemAccessInst(inst);
+                    continue;
+                }
+
                 if (inst->isDataPrefetch() || inst->isInstPrefetch()) {
                     inst->fault = NoFault;
                 }
@@ -1482,6 +1491,9 @@
 void
 DefaultIEW<Impl>::tick()
 {
+    // Reset the LSQ's per-cycle cache port usage counters.
+    ldstQueue.tick();
+
     wbNumInst = 0;
     wbCycle = 0;
 
@@ -1507,6 +1519,9 @@
     }
 
     if (exeStatus != Squashing) {
+        // Send the second packet of a split load left pending last cycle.
+        ldstQueue.readPendingSplitLoad();
+
         executeInsts();
 
         writebackInsts();
diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh
--- a/src/cpu/o3/inst_queue.hh
+++ b/src/cpu/o3/inst_queue.hh
@@ -192,6 +192,11 @@
      */
     DynInstPtr getDeferredMemInstToExecute();
 
+    /** Returns a memory instruction that was deferred due to the lack of free
+     *  ports.
+     */
+    DynInstPtr getDeferredMemAccessInstToExecute();
+
     /**
      * Records the instruction as the producer of a register without
      * adding it to the rest of the IQ.
@@ -241,6 +246,11 @@
      */
     void deferMemInst(DynInstPtr &deferred_inst);
 
+    /**
+     * Defers a memory instruction when there is lack of free cache ports.
+     */
+    void deferMemAccessInst(DynInstPtr &deferred_inst);
+
     /** Indicates an ordering violation between a store and a load. */
     void violation(DynInstPtr &store, DynInstPtr &faulting_load);
 
@@ -307,6 +317,11 @@
      */
     std::list<DynInstPtr> deferredMemInsts;
 
+    /** List of instructions waiting for free cache ports. Their access to
+     *  cache was denied at least one time for lack of free cache ports.
+     */
+    std::list<DynInstPtr> deferredMemAccessInsts;
+
     /**
      * Struct for comparing entries to be added to the priority queue.
      * This gives reverse ordering to the instructions in terms of
diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh
--- a/src/cpu/o3/inst_queue_impl.hh
+++ b/src/cpu/o3/inst_queue_impl.hh
@@ -410,6 +410,7 @@
     nonSpecInsts.clear();
     listOrder.clear();
     deferredMemInsts.clear();
+    deferredMemAccessInsts.clear();
 }
 
 template <class Impl>
@@ -740,6 +741,15 @@
         total_deferred_mem_issued++;
     }
 
+    // Issue memory instructions that were denied cache access for lack of
+    // free ports. Should this be done before getDeferredMemInstToExecute()?
+    while (total_deferred_mem_issued < totalWidth &&
+           (deferred_mem_inst = getDeferredMemAccessInstToExecute()) != 0) {
+        issueToExecuteQueue->access(0)->size++;
+        instsToExecute.push_back(deferred_mem_inst);
+        total_deferred_mem_issued++;
+    }
+
     // Have iterator to head of the list
     // While I haven't exceeded bandwidth or reached the end of the list,
     // Try to get a FU that can do what this op needs.
@@ -873,7 +883,8 @@
     // @todo If the way deferred memory instructions are handeled due to
     // translation changes then the deferredMemInsts condition should be removed
     // from the code below.
-    if (total_issued || total_deferred_mem_issued || deferredMemInsts.size()) {
+    if (total_issued || total_deferred_mem_issued || deferredMemInsts.size()
+        || deferredMemAccessInsts.size()) {
         cpu->activityThisCycle();
     } else {
         DPRINTF(IQ, "Not able to schedule any instructions.\n");
@@ -1093,6 +1104,26 @@
 
 template <class Impl>
 void
+InstructionQueue<Impl>::deferMemAccessInst(DynInstPtr &deferred_inst)
+{
+    // Appended at the tail so deferred accesses are retried oldest-first.
+    deferredMemAccessInsts.push_back(deferred_inst);
+}
+
+template <class Impl>
+typename Impl::DynInstPtr
+InstructionQueue<Impl>::getDeferredMemAccessInstToExecute()
+{
+    // Hand back the oldest deferred access, if any; NULL when none wait.
+    if (deferredMemAccessInsts.empty())
+        return NULL;
+    DynInstPtr ret = deferredMemAccessInsts.front();
+    deferredMemAccessInsts.pop_front();
+    return ret;
+}
+
+template <class Impl>
+void
 InstructionQueue<Impl>::violation(DynInstPtr &store,
                                   DynInstPtr &faulting_load)
 {
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -133,6 +133,11 @@
     void writebackStores(ThreadID tid);
 
     /**
+     * Attempts to read a pending split load.
+     */
+    void readPendingSplitLoad();
+
+    /**
      * Squash instructions from a thread until the specified sequence number.
      */
     void squash(const InstSeqNum &squashed_num, ThreadID tid)
@@ -311,6 +316,47 @@
     /** The IEW stage pointer. */
     IEW *iewStage;
 
+    /** The number of dedicated cache read ports used so far this cycle. */
+    int usedReadPorts;
+
+    /** The number of shared read/write ports used so far this cycle. */
+    int usedSharedPorts;
+
+    /** The number of dedicated cache write ports used so far this cycle. */
+    int usedWritePorts;
+
+    /** The number of dedicated cache read ports available each cycle. */
+    int cacheReadPorts;
+
+    /** The number of shared cache read/write ports available each cycle. */
+    int cacheSharedPorts;
+
+    /** The number of dedicated cache write ports available each cycle. */
+    int cacheWritePorts;
+
+    /** Whether or not there is a split load packet (second packet) that
+     * could not be sent because of a lack of free cache read ports. */
+    bool hasPendingSplitPkt;
+
+    /** Whether or not there is a free cache read port. This checks both
+     * dedicated read ports and shared read/write ports, if any. */
+    bool freeCacheReadPorts();
+
+    /** Claim one read port for this cycle: a dedicated read port if one is
+     * free, otherwise a shared read/write port. */
+    void incCacheReadPorts();
+
+    /** Whether or not there is a free cache write port. This checks both
+     * dedicated write ports and shared read/write ports, if any. */
+    bool freeCacheWritePorts();
+
+    /** Claim one write port for this cycle: a dedicated write port if one
+     * is free, otherwise a shared read/write port. */
+    void incCacheWritePorts();
+
+    /** The thread ID of the split load waiting for a free cache read port. */
+    ThreadID pendingSplitThreadId;
+
   protected:
     /** The LSQ policy for SMT mode. */
     LSQPolicy lsqPolicy;
diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh
--- a/src/cpu/o3/lsq_impl.hh
+++ b/src/cpu/o3/lsq_impl.hh
@@ -56,6 +56,9 @@
 template <class Impl>
 LSQ<Impl>::LSQ(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params)
     : cpu(cpu_ptr), iewStage(iew_ptr),
+      cacheReadPorts(params->cacheReadPorts),
+      cacheSharedPorts(params->cacheSharedPorts),
+      cacheWritePorts(params->cacheWritePorts),
       LQEntries(params->LQEntries),
       SQEntries(params->SQEntries),
       numThreads(params->numThreads),
@@ -114,6 +117,13 @@
                          maxLQEntries, maxSQEntries, tid);
         thread[tid].setDcachePort(&cpu_ptr->getDataPort());
     }
+
+    //Reset Counters
+    usedReadPorts = 0;
+    usedSharedPorts = 0;
+    usedWritePorts = 0;
+    hasPendingSplitPkt = false;
+    pendingSplitThreadId = -1;
 }
 
 
@@ -244,6 +254,12 @@
 void
 LSQ<Impl>::tick()
 {
+    // Reset the per-cycle cache port usage counters.
+    //DPRINTF(LSQ, "LSQ tick\n");
+    usedReadPorts = 0;
+    usedSharedPorts = 0;
+    usedWritePorts = 0;
+
     list<ThreadID>::iterator threads = activeThreads->begin();
     list<ThreadID>::iterator end = activeThreads->end();
 
@@ -292,6 +308,60 @@
 
 template<class Impl>
 void
+LSQ<Impl>::readPendingSplitLoad()
+{
+    if (!hasPendingSplitPkt)
+        return; // nothing left over from the previous cycle
+    assert(pendingSplitThreadId >= 0);
+    thread[pendingSplitThreadId].readPendingSplitLoad();
+    hasPendingSplitPkt = false;
+    pendingSplitThreadId = -1;
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::freeCacheReadPorts()
+{
+    const bool dedicated_free = usedReadPorts < cacheReadPorts;
+    return dedicated_free || usedSharedPorts < cacheSharedPorts;
+}
+
+template<class Impl>
+void
+LSQ<Impl>::incCacheReadPorts()
+{
+    assert(freeCacheReadPorts());
+    if (usedReadPorts < cacheReadPorts) { // dedicated ports go first
+        ++usedReadPorts;
+        return;
+    }
+    assert(usedSharedPorts < cacheSharedPorts); // fall back to shared
+    ++usedSharedPorts;
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::freeCacheWritePorts()
+{
+    const bool dedicated_free = usedWritePorts < cacheWritePorts;
+    return dedicated_free || usedSharedPorts < cacheSharedPorts;
+}
+
+template<class Impl>
+void
+LSQ<Impl>::incCacheWritePorts()
+{
+    assert(freeCacheWritePorts());
+    if (usedWritePorts < cacheWritePorts) { // dedicated ports go first
+        ++usedWritePorts;
+        return;
+    }
+    assert(usedSharedPorts < cacheSharedPorts); // fall back to shared
+    ++usedSharedPorts;
+}
+
+template<class Impl>
+void
 LSQ<Impl>::writebackStores()
 {
     list<ThreadID>::iterator threads = activeThreads->begin();
@@ -347,9 +417,16 @@
 bool
 LSQ<Impl>::recvTimingResp(PacketPtr pkt)
 {
+    assert(pkt->isResponse());
+
+    DPRINTF(LSQ, "received pkt for addr:%#x %s, isLLSC: %d, isRead: %d, "
+            "isWrite: %d, isReadWrite: %d\n", pkt->getAddr(), pkt->cmdString(),
+            pkt->isLLSC(), pkt->isRead(), pkt->isWrite(), pkt->isReadWrite());
+
     if (pkt->isError())
         DPRINTF(LSQ, "Got error packet back for address: %#X\n",
                 pkt->getAddr());
+
     thread[pkt->req->threadId()].completeDataAccess(pkt);
     return true;
 }
@@ -358,8 +435,10 @@
 void
 LSQ<Impl>::recvTimingSnoopReq(PacketPtr pkt)
 {
-    DPRINTF(LSQ, "received pkt for addr:%#x %s\n", pkt->getAddr(),
-            pkt->cmdString());
+    assert(pkt->isRequest());
+    DPRINTF(LSQ, "received snoop pkt for addr:%#x %s, isLLSC: %d, isRead: %d, "
+            "isWrite: %d, isReadWrite: %d\n", pkt->getAddr(), pkt->cmdString(),
+            pkt->isLLSC(), pkt->isRead(), pkt->isWrite(), pkt->isReadWrite());
 
     // must be a snoop
     if (pkt->isInvalidate()) {
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -61,6 +61,7 @@
 #include "mem/packet.hh"
 #include "mem/port.hh"
 #include "sim/fault_fwd.hh"
+#include "sim/eventq.hh"
 
 struct DerivO3CPUParams;
 
@@ -114,7 +115,7 @@
      * @todo: Move the number of used ports up to the LSQ level so it can
      * be shared by all LSQ units.
      */
-    void tick() { usedPorts = 0; }
+    void tick() { }
 
     /** Inserts an instruction. */
     void insert(DynInstPtr &inst);
@@ -486,6 +487,13 @@
     /** The packet that is pending free cache ports. */
     PacketPtr pendingPkt;
 
+    /** The split load packet that is waiting for a free cache read port. */
+    PacketPtr pendingSplitPkt;
+
+    /** The sender state of the pending split load packet that is waiting
+     * for a free cache read port. */
+    LSQSenderState *pendingSplitState;
+
     /** Flag for memory model. */
     bool needsTSO;
 
@@ -523,6 +531,14 @@
     /** Number of times the LSQ is blocked due to the cache. */
     Stats::Scalar lsqCacheBlocked;
 
+    /** Number of times the load request execution in the LSQ is denied due to
+     * lack of free read cache ports. */
+    Stats::Scalar cacheReadReqPortBlocked;
+
+    /** Number of times the store request execution in the LSQ is denied due to
+     * lack of free write cache ports. */
+    Stats::Scalar cacheWriteReqPortBlocked;
+
   public:
     /** Executes the load at the given index. */
     Fault read(Request *req, Request *sreqLow, Request *sreqHigh,
@@ -560,6 +576,11 @@
 
     /** Returns whether or not the LSQ unit is stalled. */
     bool isStalled()  { return stalled; }
+
+    /** Does a read for a split load that couldn't be completed
+     * the previous cycle. */
+    void readPendingSplitLoad();
+
 };
 
 template <class Impl>
@@ -646,6 +667,9 @@
             delete fst_data_pkt;
             delete snd_data_pkt;
         }
+        DPRINTF(LSQUnit, "Scheduling writeback event for load [sn:%lli] "
+               "PC %s due to MmappedIpr\n",
+                load_inst->seqNum, load_inst->pcState());
         WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, this);
         cpu->schedule(wb, cpu->clockEdge(delay));
         return NoFault;
@@ -707,7 +731,14 @@
             PacketPtr data_pkt = new Packet(req, MemCmd::ReadReq);
             data_pkt->dataStatic(load_inst->memData);
 
+
+            DPRINTF(LSQUnit, "Scheduling writeback event for load [sn:%lli] "
+                    "PC %s due to store to load forwarding\n",
+                    load_inst->seqNum, load_inst->pcState());
+
             WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, this);
+            //To disable cacheAccessDelayed
+            load_inst->cacheAccessDelayed(false);
 
             // We'll say this has a 1 cycle load-store forwarding latency
             // for now.
@@ -751,6 +782,8 @@
             iewStage->rescheduleMemInst(load_inst);
             iewStage->decrWb(load_inst->seqNum);
             load_inst->clearIssued();
+            //Disable cache access delayed
+            load_inst->cacheAccessDelayed(false);
             ++lsqRescheduledLoads;
 
             // Do not generate a writeback event as this instruction is not
@@ -773,14 +806,30 @@
     }
 
     // If there's no forwarding case, then go access memory
+
+    /*Checking for free read ports before accessing memory */
+    if (!lsq->freeCacheReadPorts()) {
+        ++cacheReadReqPortBlocked;
+        DPRINTF(LSQUnit, "no free cache read ports to send load requests.\n");
+        //NOTE: this instr does not lookup TLB any more, because
+        //translationStarted and translationCompleted are already set.
+
+        load_inst->cacheAccessDelayed(true);
+        load_inst->savedReq = req;
+        load_inst->savedSreqLow = sreqLow;
+        load_inst->savedSreqHigh = sreqHigh;
+        return NoFault;
+    }
+
+    load_inst->cacheAccessDelayed(false);
+    /*End checking */
+
     DPRINTF(LSQUnit, "Doing memory access for inst [sn:%lli] PC %s\n",
             load_inst->seqNum, load_inst->pcState());
 
     assert(!load_inst->memData);
     load_inst->memData = new uint8_t[64];
 
-    ++usedPorts;
-
     // if we the cache is not blocked, do cache access
     bool completedFirst = false;
     if (!lsq->cacheBlocked()) {
@@ -819,7 +868,13 @@
             state->mainPkt = data_pkt;
         }
 
+        DPRINTF(LSQUnit, "sendTiming pkt for addr:%#x\n",
+                fst_data_pkt->getAddr());
+        lsq->incCacheReadPorts();
+
         if (!dcachePort->sendTimingReq(fst_data_pkt)) {
+            DPRINTF(LSQUnit, "sendTiming pkt for addr:%#x failed!\n",
+                    fst_data_pkt->getAddr());
             // Delete state and data packet because a load retry
             // initiates a pipeline restart; it does not retry.
             delete state;
@@ -847,19 +902,39 @@
             // load will be squashed, so indicate this to the state object.
             // The first packet will return in completeDataAccess and be
             // handled there.
-            ++usedPorts;
-            if (!dcachePort->sendTimingReq(snd_data_pkt)) {
 
-                // The main packet will be deleted in completeDataAccess.
-                delete snd_data_pkt->req;
-                delete snd_data_pkt;
+            DPRINTF(LSQUnit, "sendTiming pkt for second addr (split):%#x\n",
+                    snd_data_pkt->getAddr());
+            if (lsq->freeCacheReadPorts()) {
+                lsq->incCacheReadPorts();
+                if (!dcachePort->sendTimingReq(snd_data_pkt)) {
 
-                state->complete();
+                    DPRINTF(LSQUnit, "sendTiming pkt for second addr (split):%#x failed!\n", snd_data_pkt->getAddr());
+                    // The main packet will be deleted in completeDataAccess.
+                    delete snd_data_pkt->req;
+                    delete snd_data_pkt;
 
-                req = NULL;
-                sreqHigh = NULL;
+                    state->complete();
 
-                lsq->setRetryTid(lsqID);
+                    req = NULL;
+                    sreqHigh = NULL;
+
+                    lsq->setRetryTid(lsqID);
+                }
+            } else {
+                //no free cache read ports
+                ++cacheReadReqPortBlocked;
+
+                //We save the packet to be sent next cycle
+                //There should be only one pending split packet for all
+                //threads, since once cache is blocked for one thread, all
+                //other threads should not be able to do more loads.
+                assert(!lsq->cacheBlocked());
+                assert(pendingSplitPkt == NULL);
+                pendingSplitPkt = snd_data_pkt;
+                pendingSplitState = state;
+                lsq->hasPendingSplitPkt = true;
+                lsq->pendingSplitThreadId = load_inst->threadNumber;
             }
         }
     }
diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -107,6 +107,9 @@
         iewStage->decrWb(inst->seqNum);
     } else {
         if (!state->noWB) {
+            DPRINTF(LSQUnit, "Complete DataAccess for load: %d, store: %d, "
+                    "PC %s, [sn:%lli]\n", inst->isLoad(), inst->isStore(),
+                    inst->pcState(), inst->seqNum);
             if (!TheISA::HasUnalignedMemAcc || !state->isSplit ||
                 !state->isLoad) {
                 writeback(inst, pkt);
@@ -116,6 +119,8 @@
         }
 
         if (inst->isStore()) {
+            DPRINTF(LSQUnit, "Complete DataAccess for store PC %s, [sn:%lli]\n",
+                    inst->pcState(), inst->seqNum);
             completeStore(state->idx);
         }
     }
@@ -133,7 +138,8 @@
 LSQUnit<Impl>::LSQUnit()
     : loads(0), stores(0), storesToWB(0), cacheBlockMask(0), stalled(false),
       isStoreBlocked(false), isLoadBlocked(false),
-      loadBlockedHandled(false), storeInFlight(false), hasPendingPkt(false)
+      loadBlockedHandled(false), storeInFlight(false), hasPendingPkt(false),
+      pendingSplitPkt(NULL), pendingSplitState(NULL)
 {
 }
 
@@ -161,7 +167,6 @@
 
     depCheckShift = params->LSQDepCheckShift;
     checkLoads = params->LSQCheckLoads;
-    cachePorts = params->cachePorts;
     needsTSO = params->needsTSO;
 
     resetState();
@@ -178,7 +183,10 @@
 
     storeHead = storeWBIdx = storeTail = 0;
 
-    usedPorts = 0;
+    //Reset Counters
+    lsq->usedReadPorts = 0;
+    lsq->usedSharedPorts = 0;
+    lsq->usedWritePorts = 0;
 
     retryPkt = NULL;
     memDepViolator = NULL;
@@ -246,6 +254,14 @@
     lsqCacheBlocked
         .name(name() + ".cacheBlocked")
         .desc("Number of times an access to memory failed due to the cache being blocked");
+
+    cacheReadReqPortBlocked
+        .name(name() + ".cacheReadReqPortBlocked")
+        .desc("Number of times an access to cache failed due to the lack of free cache read ports for load requests");
+
+    cacheWriteReqPortBlocked
+        .name(name() + ".cacheWriteReqPortBlocked")
+        .desc("Number of times an access to cache failed due to the lack of free cache write ports for store requests");
 }
 
 template<class Impl>
@@ -492,7 +508,8 @@
      */
     while (load_idx != loadTail) {
         DynInstPtr ld_inst = loadQueue[load_idx];
-        if (!ld_inst->effAddrValid() || ld_inst->uncacheable()) {
+        if (!ld_inst->effAddrValid() || ld_inst->uncacheable()
+            || ld_inst->cacheAccessDelayed()) {
             incrLdIdx(load_idx);
             continue;
         }
@@ -576,6 +593,13 @@
         load_fault == NoFault)
         return load_fault;
 
+    if (inst->cacheAccessDelayed() &&
+        load_fault == NoFault) {
+        DPRINTF(LSQUnit, "load PC %s, [sn:%lli] was delayed due to lack of "
+                "free cache ports\n", inst->pcState(), inst->seqNum);
+        return load_fault;
+    }
+
     // If the instruction faulted or predicated false, then we need to send it
     // along to commit without the instruction completing.
     if (load_fault != NoFault || inst->readPredicate() == false) {
@@ -718,12 +742,45 @@
 
 template <class Impl>
 void
+LSQUnit<Impl>::readPendingSplitLoad()
+{
+    // This is the first packet sent this cycle, so neither a dedicated
+    // read port nor a shared read/write port should have been claimed yet.
+    assert(lsq->usedReadPorts == 0 && lsq->usedSharedPorts == 0);
+    assert(lsq->hasPendingSplitPkt);
+    assert(pendingSplitPkt != NULL);
+    lsq->incCacheReadPorts();
+    const bool sent = dcachePort->sendTimingReq(pendingSplitPkt);
+    if (!sent) {
+        DPRINTF(LSQUnit,
+                "sendTiming pkt for second addr (split):%#x failed!\n",
+                pendingSplitPkt->getAddr());
+        // The main packet will be deleted in completeDataAccess.
+        delete pendingSplitPkt->req;
+        delete pendingSplitPkt;
+        pendingSplitState->complete();
+        // retryId should be -1 here.
+        lsq->setRetryTid(lsqID);
+    }
+
+    pendingSplitPkt = NULL;
+    pendingSplitState = NULL;
+}
+
+template <class Impl>
+void
 LSQUnit<Impl>::writebackPendingStore()
 {
+    //since this is the first packet we send this cycle, all dedicated cache
+    //write ports should be available, but shared ports might have already
+    //been used by load instr.
+    assert(lsq->usedWritePorts == 0);
+
     if (hasPendingPkt) {
         assert(pendingPkt != NULL);
 
         // If the cache is blocked, this will store the packet for retry.
+        DPRINTF(LSQUnit, "write back a pending store\n");
         if (sendStore(pendingPkt)) {
             storePostSend(pendingPkt);
         }
@@ -746,8 +803,16 @@
            storeWBIdx != storeTail &&
            storeQueue[storeWBIdx].inst &&
            storeQueue[storeWBIdx].canWB &&
-           ((!needsTSO) || (!storeInFlight)) &&
-           usedPorts < cachePorts) {
+           ((!needsTSO) || (!storeInFlight))
+           ) {
+
+        //Check to see if there is enough port to send this instr
+        if (!lsq->freeCacheWritePorts()) {
+            DPRINTF(LSQUnit, "Unable to write back any more stores, "
+            "no more free cache write ports!\n");
+            ++cacheWriteReqPortBlocked;
+            break;
+        }
 
         if (isStoreBlocked || lsq->cacheBlocked()) {
             DPRINTF(LSQUnit, "Unable to write back any more stores, cache"
@@ -765,8 +830,6 @@
             continue;
         }
 
-        ++usedPorts;
-
         if (storeQueue[storeWBIdx].inst->isDataPrefetch()) {
             incrStIdx(storeWBIdx);
 
@@ -899,14 +962,12 @@
                 state->pendingPacket = snd_data_pkt;
             }
         } else {
-
             // If split, try to send the second packet too
             if (split) {
                 assert(snd_data_pkt);
 
                 // Ensure there are enough ports to use.
-                if (usedPorts < cachePorts) {
-                    ++usedPorts;
+                if (lsq->freeCacheWritePorts()) {
                     if (sendStore(snd_data_pkt)) {
                         storePostSend(snd_data_pkt);
                     } else {
@@ -915,9 +976,11 @@
                                 inst->seqNum);
                     }
                 } else {
-
+                    DPRINTF(LSQUnit, "Unable to write back split pkt, "
+                            "no more free cache write ports!\n");
+                    ++cacheWriteReqPortBlocked;
                     // Store the packet for when there's free ports.
-                    assert(pendingPkt == NULL);
+                    assert(!hasPendingPkt);
                     pendingPkt = snd_data_pkt;
                     hasPendingPkt = true;
                 }
@@ -929,9 +992,6 @@
         }
     }
 
-    // Not sure this should set it to 0.
-    usedPorts = 0;
-
     assert(stores >= 0 && storesToWB >= 0);
 }
 
@@ -1087,6 +1147,8 @@
 
     // Squashed instructions do not need to complete their access.
     if (inst->isSquashed()) {
+        DPRINTF(LSQUnit,"Writeback: squashed instruction PC %s, "
+                "[sn:%lli]\n", inst->pcState(), inst->seqNum);
         iewStage->decrWb(inst->seqNum);
         assert(!inst->isStore());
         ++lsqIgnoredResponses;
@@ -1094,6 +1156,8 @@
     }
 
     if (!inst->isExecuted()) {
+        DPRINTF(LSQUnit,"Writeback: non-executed instruction PC %s, "
+                "[sn:%lli]\n", inst->pcState(), inst->seqNum);
         inst->setExecuted();
 
         // Complete access to copy data to proper place.
@@ -1165,6 +1229,9 @@
 bool
 LSQUnit<Impl>::sendStore(PacketPtr data_pkt)
 {
+    //one write port is going to be used
+    lsq->incCacheWritePorts();
+
     if (!dcachePort->sendTimingReq(data_pkt)) {
         // Need to handle becoming blocked on a store.
         isStoreBlocked = true;
@@ -1188,6 +1255,9 @@
         LSQSenderState *state =
             dynamic_cast<LSQSenderState *>(retryPkt->senderState);
 
+        //Write ports are used here at this tick and they are reset in
+        //tick(). so they might be used again in the same tick.
+
         if (dcachePort->sendTimingReq(retryPkt)) {
             // Don't finish the store unless this is the last packet.
             if (!TheISA::HasUnalignedMemAcc || !state->pktToSend ||
@@ -1202,8 +1272,20 @@
             // Send any outstanding packet.
             if (TheISA::HasUnalignedMemAcc && state->pktToSend) {
                 assert(state->pendingPacket);
-                if (sendStore(state->pendingPacket)) {
-                    storePostSend(state->pendingPacket);
+                //check for free ports
+                if (lsq->freeCacheWritePorts()) {
+                    if (sendStore(state->pendingPacket)) {
+                        storePostSend(state->pendingPacket);
+                    }
+                } else { //No free cache write port
+                    DPRINTF(LSQUnit, "Unable to write back any more stores "
+                            "when recieved a retry, no more free cache write "
+                            "ports!\n");
+                    ++cacheWriteReqPortBlocked;
+                    // Store the packet for when there's free ports.
+                    assert(!hasPendingPkt);
+                    pendingPkt = state->pendingPacket;
+                    hasPendingPkt = true;
                 }
             }
         } else {
diff --git a/src/mem/cache/BaseCache.py b/src/mem/cache/BaseCache.py
--- a/src/mem/cache/BaseCache.py
+++ b/src/mem/cache/BaseCache.py
@@ -70,3 +70,6 @@
     mem_side = MasterPort("Port on side closer to MEM")
     addr_ranges = VectorParam.AddrRange([AllMemory], "The address range for the CPU-side port")
     system = Param.System(Parent.any, "System we belong to")
+    cacheReadPorts = Param.Int(-1, "# of dedicated read ports")
+    cacheSharedPorts = Param.Int(-1, "# of shared read/write ports")
+    cacheWritePorts = Param.Int(-1, "# of dedicated write ports")
diff --git a/src/mem/cache/base.hh b/src/mem/cache/base.hh
--- a/src/mem/cache/base.hh
+++ b/src/mem/cache/base.hh
@@ -171,7 +171,8 @@
       protected:
 
         CacheSlavePort(const std::string &_name, BaseCache *_cache,
-                       const std::string &_label);
+                       const std::string &_label, int _cacheReadPorts,
+                       int _cacheSharedPorts, int _cacheWritePorts);
 
         /** A normal packet queue used to store responses. */
         SlavePacketQueue queue;
diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc
--- a/src/mem/cache/base.cc
+++ b/src/mem/cache/base.cc
@@ -57,8 +57,12 @@
 
 BaseCache::CacheSlavePort::CacheSlavePort(const std::string &_name,
                                           BaseCache *_cache,
-                                          const std::string &_label)
-    : QueuedSlavePort(_name, _cache, queue), queue(*_cache, *this, _label),
+                                          const std::string &_label,
+                                          int _cacheReadPorts,
+                                          int _cacheSharedPorts,
+                                          int _cacheWritePorts)
+    : QueuedSlavePort(_name, _cache, queue), queue(*_cache, *this, _label,
+      _cacheReadPorts, _cacheSharedPorts, _cacheWritePorts),
       blocked(false), mustSendRetry(false), sendRetryEvent(this)
 {
 }
@@ -101,7 +105,8 @@
         DPRINTF(CachePort, "Cache port %s sending retry\n", name());
         mustSendRetry = false;
         // @TODO: need to find a better time (next bus cycle?)
-        owner.schedule(sendRetryEvent, curTick() + 1);
+        //Use owner's nextCycle, better than curTick()+1.
+        owner.schedule(sendRetryEvent, owner.nextCycle());
     }
 }
 
diff --git a/src/mem/cache/cache.hh b/src/mem/cache/cache.hh
--- a/src/mem/cache/cache.hh
+++ b/src/mem/cache/cache.hh
@@ -107,7 +107,8 @@
       public:
 
         CpuSidePort(const std::string &_name, Cache<TagStore> *_cache,
-                    const std::string &_label);
+                    const std::string &_label, int _cacheReadPorts,
+                    int _cacheSharedPorts, int _cacheWritePorts);
 
     };
 
@@ -128,7 +129,8 @@
 
         MemSidePacketQueue(Cache<TagStore> &cache, MasterPort &port,
                            const std::string &label) :
-            MasterPacketQueue(cache, port, label), cache(cache) { }
+            MasterPacketQueue(cache, port, label),
+                              cache(cache) { }
 
         /**
          * Override the normal sendDeferredPacket and do not only
diff --git a/src/mem/cache/cache_impl.hh b/src/mem/cache/cache_impl.hh
--- a/src/mem/cache/cache_impl.hh
+++ b/src/mem/cache/cache_impl.hh
@@ -73,10 +73,11 @@
     tempBlock->data = new uint8_t[blkSize];
 
     cpuSidePort = new CpuSidePort(p->name + ".cpu_side", this,
-                                  "CpuSidePort");
+                                  "CpuSidePort",
+                                  p->cacheReadPorts, p->cacheSharedPorts,
+                                  p->cacheWritePorts);
     memSidePort = new MemSidePort(p->name + ".mem_side", this,
                                   "MemSidePort");
-
     tags->setCache(this);
     if (prefetcher)
         prefetcher->setCache(this);
@@ -1711,8 +1712,11 @@
 template<class TagStore>
 Cache<TagStore>::
 CpuSidePort::CpuSidePort(const std::string &_name, Cache<TagStore> *_cache,
-                         const std::string &_label)
-    : BaseCache::CacheSlavePort(_name, _cache, _label), cache(_cache)
+                         const std::string &_label, int _cacheReadPorts,
+                         int _cacheSharedPorts, int _cacheWritePorts)
+    : BaseCache::CacheSlavePort(_name, _cache, _label, _cacheReadPorts,
+                                _cacheSharedPorts, _cacheWritePorts),
+      cache(_cache)
 {
 }
 
diff --git a/src/mem/packet_queue.hh b/src/mem/packet_queue.hh
--- a/src/mem/packet_queue.hh
+++ b/src/mem/packet_queue.hh
@@ -247,6 +247,35 @@
 
     SlavePort& slavePort;
 
+  private:
+    /** Ports used so far in the current cycle; sendTiming() resets them */
+    int usedReadPorts;
+    int usedSharedPorts;
+    int usedWritePorts;
+
+    // number of dedicated read ports (negative: port limiting disabled)
+    const int cacheReadPorts;
+    // number of ports shared between reads and writes
+    const int cacheSharedPorts;
+    // number of dedicated write ports
+    const int cacheWritePorts;
+
+    // tick of the next counter reset; throttled packets are requeued for it
+    Tick nextSendCycle;
+
+    /* True if a read can still be sent this cycle, i.e. a dedicated
+     * read port or a shared port is unused. */
+    bool freeCacheReadPorts();
+    /* True if a write can still be sent this cycle, i.e. a dedicated
+     * write port or a shared port is unused. */
+    bool freeCacheWritePorts();
+    /* Claim a port for a read: a dedicated read port if one is unused,
+     * otherwise a shared port. Asserts that such a port exists. */
+    void incCacheReadPorts();
+    /* Claim a port for a write: a dedicated write port if one is unused,
+     * otherwise a shared port. Asserts that such a port exists. */
+    void incCacheWritePorts();
+
   public:
 
     /**
@@ -259,7 +288,9 @@
      * @param _label Label to push on the label stack for print request packets
      */
     SlavePacketQueue(EventManager& _em, SlavePort& _slavePort,
-                     const std::string _label = "SlavePacketQueue");
+                     const std::string _label = "SlavePacketQueue",
+                     int cacheReadPorts = -1, int cacheSharedPorts = -1,
+                     int cacheWritePorts = -1);  // read ports < 0: unbounded
 
     virtual ~SlavePacketQueue() { }
 
diff --git a/src/mem/packet_queue.cc b/src/mem/packet_queue.cc
--- a/src/mem/packet_queue.cc
+++ b/src/mem/packet_queue.cc
@@ -45,6 +45,8 @@
 #include "debug/Drain.hh"
 #include "debug/PacketQueue.hh"
 #include "mem/packet_queue.hh"
+#include "sim/clocked_object.hh"
+#include "mem/mem_object.hh"
 
 using namespace std;
 
@@ -228,6 +230,44 @@
 }
 
 bool
+SlavePacketQueue::freeCacheReadPorts()
+{
+    return ((usedReadPorts < cacheReadPorts) ||     // dedicated read port
+           (usedSharedPorts < cacheSharedPorts));   // or shared port unused
+}
+
+bool
+SlavePacketQueue::freeCacheWritePorts()
+{
+    return ((usedWritePorts < cacheWritePorts) ||   // dedicated write port
+           (usedSharedPorts < cacheSharedPorts));   // or shared port unused
+}
+
+void
+SlavePacketQueue::incCacheReadPorts()
+{
+    assert(freeCacheReadPorts());  // callers must check for a free port
+    if (usedReadPorts < cacheReadPorts)  // dedicated ports are taken first
+        ++usedReadPorts;
+    else { // dedicated read ports exhausted: fall back to a shared port
+        assert(usedSharedPorts < cacheSharedPorts);
+        ++usedSharedPorts;
+    }
+}
+
+void
+SlavePacketQueue::incCacheWritePorts()
+{
+    assert(freeCacheWritePorts());  // callers must check for a free port
+    if (usedWritePorts < cacheWritePorts)  // dedicated ports are taken first
+        ++usedWritePorts;
+    else { // dedicated write ports exhausted: fall back to a shared port
+        assert(usedSharedPorts < cacheSharedPorts);
+        ++usedSharedPorts;
+    }
+}
+
+bool
 MasterPacketQueue::sendTiming(PacketPtr pkt, bool send_as_snoop)
 {
     // attempt to send the packet and return according to the outcome
@@ -238,9 +278,18 @@
 }
 
 SlavePacketQueue::SlavePacketQueue(EventManager& _em, SlavePort& _slavePort,
-                                   const std::string _label)
-    : PacketQueue(_em, _label), slavePort(_slavePort)
+                                   const std::string _label,
+                                   int _cacheReadPorts,
+                                   int _cacheSharedPorts,
+                                   int _cacheWritePorts)
+    : PacketQueue(_em, _label), slavePort(_slavePort),
+      usedReadPorts(0), usedSharedPorts(0), usedWritePorts(0),
+      cacheReadPorts(_cacheReadPorts), cacheSharedPorts(_cacheSharedPorts),
+      cacheWritePorts(_cacheWritePorts), nextSendCycle(0)
 {
+    // All usage counters start at zero and nextSendCycle at tick 0, so the
+    // first bounded sendTiming() call takes the counter-reset path and
+    // computes the real cycle boundary.
 }
 
 bool
@@ -248,5 +297,52 @@
 {
     // we should never have queued snoop requests
     assert(!send_as_snoop);
-    return slavePort.sendTimingResp(pkt);
+
+    // A negative read-port count means the ports are not bounded.
+    if (cacheReadPorts < 0)
+        return slavePort.sendTimingResp(pkt);
+
+    // The ports are bounded. There is no per-cycle tick() hook to clear
+    // the usage counters, so they are reset lazily by the first send
+    // attempted at or after the cycle boundary held in nextSendCycle.
+    if (curTick() >= nextSendCycle) {
+        nextSendCycle = slavePort.getOwner().nextIncomingCycle();
+        usedReadPorts = 0;
+        usedSharedPorts = 0;
+        usedWritePorts = 0;
+    }
+
+    // attempt to send the packet and return according to the outcome
+    if (pkt->isRead()) {
+        if (freeCacheReadPorts()) {
+            incCacheReadPorts();
+            DPRINTF(PacketQueue, "sending read resp pkt %#x\n", pkt->getAddr());
+            return slavePort.sendTimingResp(pkt);
+        } else { //no free cache read port
+            // no port left in this cycle: requeue the packet for the next
+            DPRINTF(PacketQueue, "Scheduling read resp pkt %#x for tick %lli "
+                    "due to lack of free ports\n", pkt->getAddr(),
+                    nextSendCycle);
+            schedSendTiming(pkt, nextSendCycle);
+            // returning true tells the caller the send was not refused, so
+            // it neither sets waitingOnRetry nor expects a recvRetry
+            return true;
+        }
+    } else if (pkt->isWrite()) {
+        if (freeCacheWritePorts()) {
+            incCacheWritePorts();
+            DPRINTF(PacketQueue, "sending write resp pkt %#x\n",
+                    pkt->getAddr());
+            return slavePort.sendTimingResp(pkt);
+        } else { //no free cache write port
+            DPRINTF(PacketQueue, "Scheduling write resp pkt %#x for tick %lli "
+                    "due to lack of free ports\n", pkt->getAddr(),
+                    nextSendCycle);
+            schedSendTiming(pkt, nextSendCycle);
+            return true;
+        }
+    }
+    // Neither read nor write (presumably upgrade/error responses; confirm):
+    // no port is charged and the packet is forwarded instead of panicking.
+    return slavePort.sendTimingResp(pkt);
 }
diff --git a/src/mem/port.hh b/src/mem/port.hh
--- a/src/mem/port.hh
+++ b/src/mem/port.hh
@@ -114,6 +114,9 @@
     /** Get the port id. */
     PortID getId() const { return id; }
 
+    /** Get a reference to the MemObject that owns this port. */
+    MemObject& getOwner();
+
 };
 
 /** Forward declaration */
diff --git a/src/mem/port.cc b/src/mem/port.cc
--- a/src/mem/port.cc
+++ b/src/mem/port.cc
@@ -59,6 +59,12 @@
 {
 }
 
+MemObject&
+Port::getOwner()
+{
+    return owner;  // exposes the owning MemObject by (mutable) reference
+}
+
 BaseMasterPort::BaseMasterPort(const std::string& name, MemObject* owner,
                                PortID _id)
     : Port(name, *owner, _id), _baseSlavePort(NULL)
diff --git a/src/sim/clocked_object.hh b/src/sim/clocked_object.hh
--- a/src/sim/clocked_object.hh
+++ b/src/sim/clocked_object.hh
@@ -180,6 +180,18 @@
     Tick nextCycle() const
     { return clockEdge(); }
 
+    /**
+     * Variant of nextCycle() for callers that need an edge other than the
+     * current tick: when called exactly on a clock edge, the edge one
+     * clock period later is returned instead.
+     */
+    Tick nextIncomingCycle() const
+    {
+        const Tick edge = clockEdge();
+        // an edge that coincides with the current tick is skipped
+        return (edge == curTick()) ? edge + clock : edge;
+    }
+
     inline uint64_t frequency() const { return SimClock::Frequency / clock; }
 
     inline Tick clockPeriod() const { return clock; }