diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh
--- a/src/gpu-compute/wavefront.hh
+++ b/src/gpu-compute/wavefront.hh
@@ -83,6 +83,7 @@
   public:
     // pointer to buffer for storing function arguments
     uint8_t *mem;
+    int wfSize; // wavefront size given to the ctor; multiplies per-item arg size
     // size of function args
     int funcArgsSizePerItem;
 
@@ -90,13 +91,13 @@
     int
     getLaneOffset(int lane, int addr)
     {
-        return addr * VSZ + sizeof(CType) * lane;
+        return addr * wfSize + sizeof(CType) * lane;
     }
 
-    CallArgMem(int func_args_size_per_item)
-      : funcArgsSizePerItem(func_args_size_per_item)
+    CallArgMem(int func_args_size_per_item, int wf_size)
+        : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
     {
-        mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ);
+        mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize);
     }
 
     ~CallArgMem()
@@ -192,9 +193,9 @@
     bool isOldestInstALU();
     bool isOldestInstBarrier();
     // used for passing spill address to DDInstGPU
-    uint64_t last_addr[VSZ];
-    uint32_t workitemid[3][VSZ];
-    uint32_t workitemFlatId[VSZ];
+    std::vector<Addr> last_addr;
+    std::vector<uint32_t> workitemid[3];
+    std::vector<uint32_t> workitemFlatId;
     uint32_t workgroupid[3];
     uint32_t workgroupsz[3];
     uint32_t gridsz[3];
@@ -230,14 +231,14 @@
     uint32_t startVgprIndex;
 
     // Old value of destination gpr (for trace)
-    uint32_t old_vgpr[VSZ];
+    std::vector<uint32_t> old_vgpr;
     // Id of destination gpr (for trace)
     uint32_t old_vgpr_id;
     // Tick count of last old_vgpr copy
     uint64_t old_vgpr_tcnt;
 
     // Old value of destination gpr (for trace)
-    uint64_t old_dgpr[VSZ];
+    std::vector<uint64_t> old_dgpr;
     // Id of destination gpr (for trace)
     uint32_t old_dgpr_id;
     // Tick count of last old_vgpr copy
@@ -247,7 +248,7 @@
     VectorMask init_mask;
 
     // number of barriers this WF has joined
-    int bar_cnt[VSZ];
+    std::vector<int> bar_cnt;
     int max_bar_cnt;
     // Flag to stall a wave on barrier
     bool stalledAtBarrier;
@@ -296,9 +297,9 @@
     // argument memory for hsail call instruction
     CallArgMem *callArgMem;
     void
-    initCallArgMem(int func_args_size_per_item)
+    initCallArgMem(int func_args_size_per_item, int wf_size)
     {
-        callArgMem = new CallArgMem(func_args_size_per_item);
+        callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
     }
 
     template<typename CType>
@@ -327,7 +328,6 @@
     }
 
     void start(uint64_t _wfDynId, uint64_t _base_ptr);
-
     void exec();
     void updateResources();
     int ready(itype_e type);
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -55,7 +55,6 @@
     last_trace = 0;
     simdId = p->simdId;
     wfSlotId = p->wf_slot_id;
-
     status = S_STOPPED;
     reservedVectorRegs = 0;
     startVgprIndex = 0;
@@ -77,12 +76,21 @@
     mem_trace_busy = 0;
     old_vgpr_tcnt = 0xffffffffffffffffll;
     old_dgpr_tcnt = 0xffffffffffffffffll;
+    old_vgpr.resize(p->wfSize);
 
     pendingFetch = false;
     dropFetch = false;
     condRegState = new ConditionRegisterState();
     maxSpVgprs = 0;
     maxDpVgprs = 0;
+    last_addr.resize(p->wfSize);
+    workitemFlatId.resize(p->wfSize);
+    old_dgpr.resize(p->wfSize);
+    bar_cnt.resize(p->wfSize);
+    remoteAddrList.resize(p->wfSize);
+    for (int i = 0; i < 3; ++i) {
+        workitemid[i].resize(p->wfSize);
+    }
 }
 
 void
@@ -142,6 +150,7 @@
 {
     if (callArgMem)
         delete callArgMem;
+    delete condRegState;
 }
 
 void
# Node ID 3a8eb18b522f6b3f93e13610bc1c9c1fa3d9b11a
# Parent  bda2c39fd9fdcedbe7a4b1df38f9ac1279208eee
diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -250,8 +250,10 @@
     vrfs = []
     for j in xrange(options.simds_per_cu):
         for k in xrange(shader.n_wf):
-            wavefronts.append(Wavefront(simdId = j, wf_slot_id = k))
+            wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
+                                        wfSize = options.wf_size))
         vrfs.append(VectorRegisterFile(simd_id=j,
+                                       wfSize=options.wf_size,
                               num_regs_per_simd=options.vreg_file_size))
     compute_units[-1].wavefronts = wavefronts
     compute_units[-1].vector_register_file = vrfs
diff --git a/src/arch/hsail/gen.py b/src/arch/hsail/gen.py
--- a/src/arch/hsail/gen.py
+++ b/src/arch/hsail/gen.py
@@ -235,7 +235,7 @@
 
     const VectorMask &mask = w->get_pred();
 
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
         if (mask[lane]) {
             DestCType dest_val = $expr;
             this->dest.set(w, lane, dest_val);
@@ -256,7 +256,7 @@
 
     const VectorMask &mask = w->get_pred();
 
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
         if (mask[lane]) {
             SrcCType src_val0 = this->src0.get<SrcCType>(w, lane);
             DestCType dest_val = $expr;
@@ -277,7 +277,7 @@
 
     const VectorMask &mask = w->get_pred();
 
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
         if (mask[lane]) {
             CType dest_val;
             if ($dest_is_src_flag) {
@@ -312,7 +312,7 @@
 
     const VectorMask &mask = w->get_pred();
 
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
         if (mask[lane]) {
             CType dest_val;
 
@@ -346,7 +346,7 @@
 
     const VectorMask &mask = w->get_pred();
 
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
         if (mask[lane]) {
             DestT dest_val;
             if ($dest_is_src_flag) {
@@ -372,7 +372,7 @@
     Wavefront *w = gpuDynInst->wavefront();
 
     const VectorMask &mask = w->get_pred();
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
         if (mask[lane]) {
             CType dest_val;
 
@@ -401,7 +401,7 @@
 
     const VectorMask &mask = w->get_pred();
 
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
         if (mask[lane]) {
             DestCType dest_val;
             SrcCType src_val[$num_srcs];
diff --git a/src/arch/hsail/insts/branch.hh b/src/arch/hsail/insts/branch.hh
--- a/src/arch/hsail/insts/branch.hh
+++ b/src/arch/hsail/insts/branch.hh
@@ -279,7 +279,7 @@
         // taken branch
         const uint32_t true_pc = getTargetPc();
         VectorMask true_mask;
-        for (unsigned int lane = 0; lane < VSZ; ++lane) {
+        for (unsigned int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             true_mask[lane] = cond.get<bool>(w, lane) & curr_mask[lane];
         }
 
diff --git a/src/arch/hsail/insts/main.cc b/src/arch/hsail/insts/main.cc
--- a/src/arch/hsail/insts/main.cc
+++ b/src/arch/hsail/insts/main.cc
@@ -134,7 +134,7 @@
         const VectorMask &mask = w->get_pred();
 
         // mask off completed work-items
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (mask[lane]) {
                 w->init_mask[lane] = 0;
             }
diff --git a/src/arch/hsail/insts/mem.hh b/src/arch/hsail/insts/mem.hh
--- a/src/arch/hsail/insts/mem.hh
+++ b/src/arch/hsail/insts/mem.hh
@@ -454,7 +454,7 @@
             gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
 
             if (num_dest_operands > 1) {
-                for (int i = 0; i < VSZ; ++i)
+                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                     if (gpuDynInst->exec_mask[i])
                         gpuDynInst->statusVector.push_back(num_dest_operands);
                     else
@@ -463,9 +463,10 @@
 
             for (int k = 0; k < num_dest_operands; ++k) {
 
-                c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ];
+                c0 *d = &((c0*)gpuDynInst->d_data)
+                    [k * gpuDynInst->computeUnit()->wfSize()];
 
-                for (int i = 0; i < VSZ; ++i) {
+                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                     if (gpuDynInst->exec_mask[i]) {
                         Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
 
@@ -1001,7 +1002,7 @@
             gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
 
             if (num_src_operands > 1) {
-                for (int i = 0; i < VSZ; ++i)
+                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                     if (gpuDynInst->exec_mask[i])
                         gpuDynInst->statusVector.push_back(num_src_operands);
                     else
@@ -1009,9 +1010,10 @@
             }
 
             for (int k = 0; k < num_src_operands; ++k) {
-                c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ];
+                c0 *d = &((c0*)gpuDynInst->d_data)
+                    [k * gpuDynInst->computeUnit()->wfSize()];
 
-                for (int i = 0; i < VSZ; ++i) {
+                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                     if (gpuDynInst->exec_mask[i]) {
                         Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
 
@@ -1399,7 +1401,7 @@
             c0 *e = &((c0*) gpuDynInst->a_data)[0];
             c0 *f = &((c0*) gpuDynInst->x_data)[0];
 
-            for (int i = 0; i < VSZ; ++i) {
+            for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                 if (gpuDynInst->exec_mask[i]) {
                     Addr vaddr = gpuDynInst->addr[i];
 
diff --git a/src/arch/hsail/insts/mem_impl.hh b/src/arch/hsail/insts/mem_impl.hh
--- a/src/arch/hsail/insts/mem_impl.hh
+++ b/src/arch/hsail/insts/mem_impl.hh
@@ -60,14 +60,16 @@
 
         typedef typename DestDataType::CType CType M5_VAR_USED;
         const VectorMask &mask = w->get_pred();
-        uint64_t addr_vec[VSZ];
+        std::vector<Addr> addr_vec;
+        addr_vec.resize(w->computeUnit->wfSize(), (Addr)0);
         this->addr.calcVector(w, addr_vec);
 
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (mask[lane]) {
                 this->dest.set(w, lane, addr_vec[lane]);
             }
         }
+        addr_vec.clear();
     }
 
     template<typename MemDataType, typename DestDataType,
@@ -121,8 +123,8 @@
             i->parent->findSymbol(Brig::BrigPrivateSpace, addr);
         assert(se);
 
-        return w->wfSlotId * w->privSizePerItem * VSZ +
-            se->offset * VSZ +
+        return w->wfSlotId * w->privSizePerItem * w->computeUnit->wfSize() +
+            se->offset * w->computeUnit->wfSize() +
             lane * se->size;
         */
 
@@ -139,9 +141,11 @@
         Addr addr_div8 = addr / 8;
         Addr addr_mod8 = addr % 8;
 
-        Addr ret = addr_div8 * 8 * VSZ + lane * 8 + addr_mod8 + w->privBase;
+        Addr ret = addr_div8 * 8 * w->computeUnit->wfSize() + lane * 8 +
+            addr_mod8 + w->privBase;
 
-        assert(ret < w->privBase + (w->privSizePerItem * VSZ));
+        assert(ret < w->privBase +
+               (w->privSizePerItem * w->computeUnit->wfSize()));
 
         return ret;
     }
@@ -175,7 +179,7 @@
 
             DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val);
 
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                 if (mask[lane]) {
                     this->dest.set(w, lane, val);
                 }
@@ -184,7 +188,7 @@
             return;
         } else if (this->segment == Brig::BRIG_SEGMENT_ARG) {
             uint64_t address = this->addr.calcUniform();
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                 if (mask[lane]) {
                     MemCType val = w->readCallArgMem<MemCType>(lane, address);
 
@@ -239,7 +243,7 @@
             // this is a complete hack to get around a compiler bug
             // (the compiler currently generates global access for private
             //  addresses (starting from 0). We need to add the private offset)
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                 if (m->addr[lane] < w->privSizePerItem) {
                     if (mask[lane]) {
                         // what is the size of the object we are accessing?
@@ -267,7 +271,7 @@
             m->pipeId = GLBMEM_PIPE;
             m->latency.set(w->computeUnit->shader->ticks(1));
             {
-                for (int lane = 0; lane < VSZ; ++lane) {
+                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                     //  note: this calculation will NOT WORK if the compiler
                     //  ever generates loads/stores to the same address with
                     //  different widths (e.g., a ld_u32 addr and a ld_u16 addr)
@@ -301,7 +305,7 @@
             m->pipeId = GLBMEM_PIPE;
             m->latency.set(w->computeUnit->shader->ticks(1));
 
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                 if (mask[lane]) {
                     assert(m->addr[lane] + sizeof(MemCType) <= w->roSize);
                     m->addr[lane] += w->roBase;
@@ -318,7 +322,7 @@
             m->pipeId = GLBMEM_PIPE;
             m->latency.set(w->computeUnit->shader->ticks(1));
             {
-                for (int lane = 0; lane < VSZ; ++lane) {
+                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                     if (mask[lane]) {
                         assert(m->addr[lane] < w->privSizePerItem);
 
@@ -360,7 +364,7 @@
         if (this->segment == Brig::BRIG_SEGMENT_ARG) {
             uint64_t address = this->addr.calcUniform();
 
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                 if (mask[lane]) {
                     CType data = this->src.template get<CType>(w, lane);
                     DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data);
@@ -378,7 +382,7 @@
         this->addr.calcVector(w, m->addr);
 
         if (num_src_operands == 1) {
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                 if (mask[lane]) {
                     ((CType*)m->d_data)[lane] =
                         this->src.template get<CType>(w, lane);
@@ -386,9 +390,10 @@
             }
         } else {
             for (int k= 0; k < num_src_operands; ++k) {
-                for (int lane = 0; lane < VSZ; ++lane) {
+                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                     if (mask[lane]) {
-                        ((CType*)m->d_data)[k * VSZ + lane] =
+                        ((CType*)m->d_data)
+                            [k * w->computeUnit->wfSize() + lane] =
                             this->src_vect[k].template get<CType>(w, lane);
                     }
                 }
@@ -428,7 +433,7 @@
             // this is a complete hack to get around a compiler bug
             // (the compiler currently generates global access for private
             //  addresses (starting from 0). We need to add the private offset)
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                 if (mask[lane]) {
                     if (m->addr[lane] < w->privSizePerItem) {
 
@@ -454,7 +459,7 @@
             m->pipeId = GLBMEM_PIPE;
             m->latency.set(w->computeUnit->shader->ticks(1));
             {
-                for (int lane = 0; lane < VSZ; ++lane) {
+                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                     if (mask[lane]) {
                         assert(m->addr[lane] < w->spillSizePerItem);
 
@@ -483,7 +488,7 @@
             m->pipeId = GLBMEM_PIPE;
             m->latency.set(w->computeUnit->shader->ticks(1));
             {
-                for (int lane = 0; lane < VSZ; ++lane) {
+                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                     if (mask[lane]) {
                         assert(m->addr[lane] < w->privSizePerItem);
                         m->addr[lane] = m->addr[lane] + lane *
@@ -558,14 +563,14 @@
 
         this->addr.calcVector(w, m->addr);
 
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             ((CType *)m->a_data)[lane] =
                 this->src[0].template get<CType>(w, lane);
         }
 
         // load second source operand for CAS
         if (NumSrcOperands > 1) {
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                 ((CType*)m->x_data)[lane] =
                     this->src[1].template get<CType>(w, lane);
             }
diff --git a/src/arch/hsail/insts/pseudo_inst.cc b/src/arch/hsail/insts/pseudo_inst.cc
--- a/src/arch/hsail/insts/pseudo_inst.cc
+++ b/src/arch/hsail/insts/pseudo_inst.cc
@@ -84,7 +84,7 @@
         int op = 0;
         bool got_op = false;
 
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (mask[lane]) {
                 int src_val0 = src1.get<int>(w, lane, 0);
                 if (got_op) {
@@ -182,7 +182,7 @@
     {
     #if TRACING_ON
         const VectorMask &mask = w->get_pred();
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (mask[lane]) {
                 int src_val1 = src1.get<int>(w, lane, 1);
                 int src_val2 = src1.get<int>(w, lane, 2);
@@ -205,7 +205,7 @@
     {
     #if TRACING_ON
         const VectorMask &mask = w->get_pred();
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (mask[lane]) {
                 int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
                 int src_val2 = src1.get<int>(w, lane, 2);
@@ -231,7 +231,7 @@
         std::string res_str;
         res_str = csprintf("krl_prt (%s)\n", disassemble());
 
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (!(lane & 7)) {
                 res_str += csprintf("DB%03d: ", (int)w->wfDynId);
             }
@@ -270,7 +270,7 @@
         int src_val3 = -1;
         res_str = csprintf("krl_prt (%s)\n", disassemble());
 
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (!(lane & 7)) {
                 res_str += csprintf("DB%03d: ", (int)w->wfDynId);
             }
@@ -311,7 +311,7 @@
         std::string res_str;
         res_str = csprintf("krl_prt (%s)\n", disassemble());
 
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (!(lane & 3)) {
                 res_str += csprintf("DB%03d: ", (int)w->wfDynId);
             }
@@ -350,7 +350,7 @@
         int src_val3 = -1;
         res_str = csprintf("krl_prt (%s)\n", disassemble());
 
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (!(lane & 3)) {
                 res_str += csprintf("DB%03d: ", (int)w->wfDynId);
             }
@@ -391,7 +391,7 @@
         std::string res_str;
         res_str = csprintf("krl_prt (%s)\n", disassemble());
 
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (!(lane & 7)) {
                 res_str += csprintf("DB%03d: ", (int)w->wfDynId);
             }
@@ -430,7 +430,7 @@
         res_str += csprintf("  Executing on CU #%i\n", w->computeUnit->cu_id);
         res_str += csprintf("  Exec mask: ");
 
-        for (int i = VSZ - 1; i >= 0; --i) {
+        for (int i = w->computeUnit->wfSize() - 1; i >= 0; --i) {
             if (w->execMask(i))
                 res_str += "1";
             else
@@ -458,7 +458,7 @@
         const VectorMask &mask = w->get_pred();
         int res = 0;
 
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (mask[lane]) {
                 int src_val1 = src1.get<int>(w, lane, 1);
                 dest.set<int>(w, lane, res);
@@ -477,14 +477,14 @@
         const VectorMask &mask = w->get_pred();
         int res = 0;
 
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (mask[lane]) {
                 int src_val1 = src1.get<int>(w, lane, 1);
                 res += src_val1;
             }
         }
 
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (mask[lane]) {
                 dest.set<int>(w, lane, res);
             }
@@ -497,19 +497,19 @@
         const VectorMask &mask = w->get_pred();
         int res = 0;
 
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (mask[lane]) {
                 int src_val1 = src1.get<int>(w, lane, 1);
 
                 if (src_val1) {
-                    if (lane < (VSZ/2)) {
+                    if (lane < (w->computeUnit->wfSize()/2)) {
                         res = res | ((uint32_t)(1) << lane);
                     }
                 }
             }
         }
 
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (mask[lane]) {
                 dest.set<int>(w, lane, res);
             }
@@ -521,19 +521,20 @@
     {
         const VectorMask &mask = w->get_pred();
         int res = 0;
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (mask[lane]) {
                 int src_val1 = src1.get<int>(w, lane, 1);
 
                 if (src_val1) {
-                    if (lane >= (VSZ/2)) {
-                        res = res | ((uint32_t)(1) << (lane - (VSZ/2)));
+                    if (lane >= (w->computeUnit->wfSize()/2)) {
+                        res = res | ((uint32_t)(1) <<
+                                     (lane - (w->computeUnit->wfSize()/2)));
                     }
                 }
             }
         }
 
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (mask[lane]) {
                 dest.set<int>(w, lane, res);
             }
@@ -546,7 +547,7 @@
         const VectorMask &mask = w->get_pred();
         int max_cnt = 0;
 
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (mask[lane]) {
                 w->bar_cnt[lane]++;
 
@@ -567,7 +568,7 @@
         const VectorMask &mask = w->get_pred();
         int max_cnt = 0;
 
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (mask[lane]) {
                 w->bar_cnt[lane]--;
             }
@@ -592,7 +593,7 @@
     {
         const VectorMask &mask = w->get_pred();
 
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (mask[lane]) {
                 int src_val1 = src1.get<int>(w, lane, 1);
                 panic("OpenCL Code failed assertion #%d. Triggered by lane %s",
@@ -605,7 +606,7 @@
     Call::calcAddr(Wavefront *w, GPUDynInstPtr m)
     {
         // the address is in src1 | src2
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             int src_val1 = src1.get<int>(w, lane, 1);
             int src_val2 = src1.get<int>(w, lane, 2);
             Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2);
@@ -622,7 +623,7 @@
 
         calcAddr(w, m);
 
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3);
         }
 
@@ -661,7 +662,7 @@
         GPUDynInstPtr m = gpuDynInst;
         calcAddr(w, m);
 
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1);
         }
 
@@ -736,7 +737,7 @@
         const VectorMask &mask = w->get_pred();
         int src_val1 = 0;
 
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (mask[lane]) {
                 src_val1 = src1.get<int>(w, lane, 1);
                 break;
@@ -758,7 +759,7 @@
         const VectorMask &mask = w->get_pred();
         unsigned mst = true;
 
-        for (int lane = VSZ - 1; lane >= 0; --lane) {
+        for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) {
             if (mask[lane]) {
                 dest.set<int>(w, lane, mst);
                 mst = false;
@@ -773,7 +774,7 @@
         int res = 0;
         bool got_res = false;
 
-        for (int lane = VSZ - 1; lane >= 0; --lane) {
+        for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) {
             if (mask[lane]) {
                 if (!got_res) {
                     res = src1.get<int>(w, lane, 1);
diff --git a/src/arch/hsail/operand.hh b/src/arch/hsail/operand.hh
--- a/src/arch/hsail/operand.hh
+++ b/src/arch/hsail/operand.hh
@@ -43,6 +43,7 @@
  */
 
 #include <string>
+#include <limits>
 
 #include "arch/hsail/Brig.h"
 #include "base/trace.hh"
@@ -346,6 +347,8 @@
 template<typename T>
 class ImmOperand : public BaseOperand
 {
+  private:
+    uint16_t kind = Brig::BRIG_KIND_NONE; // defined until an init*() call sets it
   public:
     T bits;
 
@@ -355,11 +358,21 @@
 
     template<typename OperandType>
     OperandType
-    get()
+    get(Wavefront *w)
     {
         assert(sizeof(OperandType) <= sizeof(T));
+        panic_if(w == nullptr, "WF pointer needs to be set");
 
-        return *(OperandType*)&bits;
+        switch (kind) {
+          // immediate operand is WF size
+          case Brig::BRIG_KIND_OPERAND_WAVESIZE:
+            return (OperandType)w->computeUnit->wfSize();
+            break;
+
+          default:
+            return *(OperandType*)&bits;
+            break;
+        }
     }
 
     // This version of get() takes a WF* and a lane id for
@@ -368,7 +381,7 @@
     OperandType
     get(Wavefront *w, int lane)
     {
-        return get<OperandType>();
+        return get<OperandType>(w);
     }
 };
 
@@ -388,16 +401,18 @@
             auto cbptr = (Brig::BrigOperandConstantBytes*)brigOp;
 
             bits = *((T*)(obj->getData(cbptr->bytes + 4)));
-
+            kind = brigOp->kind;
             return true;
         }
         break;
 
       case Brig::BRIG_KIND_OPERAND_WAVESIZE:
-        bits = VSZ;
+        kind = brigOp->kind;
+        bits = std::numeric_limits<unsigned long long>::digits;
         return true;
 
       default:
+        kind = Brig::BRIG_KIND_NONE;
         return false;
     }
 }
@@ -409,6 +424,7 @@
     const Brig::BrigOperand *brigOp = obj->getOperand(opOffset);
 
     if (brigOp->kind != Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
+        kind = Brig::BRIG_KIND_NONE;
         return false;
     }
 
@@ -423,6 +439,7 @@
         (const Brig::BrigOperand *)obj->getOperand(*data_offset);
 
     if (p->kind != Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
+        kind = Brig::BRIG_KIND_NONE;
         return false;
     }
 
@@ -456,7 +473,7 @@
     OperandType
     get(Wavefront *w, int lane)
     {
-        return is_imm ?  imm_op.template get<OperandType>() :
+        return is_imm ?  imm_op.template get<OperandType>(w) :
                          reg_op.template get<OperandType>(w, lane);
     }
 
@@ -571,7 +588,7 @@
     uint64_t calcUniformBase();
 
   public:
-    virtual void calcVector(Wavefront *w, uint64_t *addrVec) = 0;
+    virtual void calcVector(Wavefront *w, std::vector<Addr> &addrVec) = 0;
     virtual uint64_t calcLane(Wavefront *w, int lane=0) = 0;
 
     uint64_t offset;
@@ -586,7 +603,7 @@
     RegOperandType reg;
     void init(unsigned opOffset, const BrigObject *obj);
     uint64_t calcUniform();
-    void calcVector(Wavefront *w, uint64_t *addrVec);
+    void calcVector(Wavefront *w, std::vector<Addr> &addrVec);
     uint64_t calcLane(Wavefront *w, int lane=0);
     uint32_t opSize() { return reg.opSize(); }
     bool isVectorRegister() { return reg.registerType == Enums::RT_VECTOR; }
@@ -641,11 +658,12 @@
 
 template<typename RegOperandType>
 void
-RegAddrOperand<RegOperandType>::calcVector(Wavefront *w, uint64_t *addrVec)
+RegAddrOperand<RegOperandType>::calcVector(Wavefront *w,
+                                           std::vector<Addr> &addrVec)
 {
     Addr address = calcUniformBase();
 
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
         if (w->execMask(lane)) {
             if (reg.regFileChar == 's') {
                 addrVec[lane] = address + reg.template get<uint32_t>(w, lane);
@@ -680,7 +698,7 @@
   public:
     void init(unsigned opOffset, const BrigObject *obj);
     uint64_t calcUniform();
-    void calcVector(Wavefront *w, uint64_t *addrVec);
+    void calcVector(Wavefront *w, std::vector<Addr> &addrVec);
     uint64_t calcLane(Wavefront *w, int lane=0);
     std::string disassemble();
 };
@@ -698,11 +716,11 @@
 }
 
 inline void
-NoRegAddrOperand::calcVector(Wavefront *w, uint64_t *addrVec)
+NoRegAddrOperand::calcVector(Wavefront *w, std::vector<Addr> &addrVec)
 {
     uint64_t address = calcUniformBase();
 
-    for (int lane = 0; lane < VSZ; ++lane)
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane)
         addrVec[lane] = address;
 }
 
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -60,6 +60,7 @@
     simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
     num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
     min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
+    wfSize = Param.Int(64, 'Wavefront size (in work items)')
 
 class Wavefront(SimObject):
     type = 'Wavefront'
@@ -68,6 +69,7 @@
 
     simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
     wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
+    wfSize = Param.Int(64, 'Wavefront size (in work items)')
 
 class ComputeUnit(MemObject):
     type = 'ComputeUnit'
diff --git a/src/gpu-compute/cl_driver.cc b/src/gpu-compute/cl_driver.cc
--- a/src/gpu-compute/cl_driver.cc
+++ b/src/gpu-compute/cl_driver.cc
@@ -238,7 +238,7 @@
       case HSA_GET_VSZ:
         {
             BufferArg buf(buf_addr, sizeof(uint32_t));
-            *((uint32_t*)buf.bufferPtr()) = VSZ;
+            *((uint32_t*)buf.bufferPtr()) = dispatcher->wfSize();
             buf.copyOut(tc->getMemProxy());
         }
         break;
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -161,22 +161,8 @@
     // if fixed-stride prefetching, this is the stride.
     int prefetchStride;
 
-    class LastVaddrWave
-    {
-      public:
-        Addr vaddrs[VSZ];
-        Addr& operator[](int idx) {
-            return vaddrs[idx];
-        }
-
-        LastVaddrWave() {
-            for (int i = 0; i < VSZ; ++i)
-                vaddrs[i] = 0;
-        }
-    };
-
-    LastVaddrWave lastVaddrCU;
-    std::vector<LastVaddrWave> lastVaddrPhase;
+    std::vector<Addr> lastVaddrCU;
+    std::vector<std::vector<Addr>> lastVaddrSimd;
     std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
     Enums::PrefetchType prefetchType;
     EXEC_POLICY exec_policy;
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -32,8 +32,9 @@
  *
  * Author: John Kalamatianos, Anthony Gutierrez
  */
+#include "gpu-compute/compute_unit.hh"
 
-#include "gpu-compute/compute_unit.hh"
+#include <limits>
 
 #include "base/output.hh"
 #include "debug/GPUDisp.hh"
@@ -76,14 +77,27 @@
     _masterId(p->system->getMasterId(name() + ".ComputeUnit")),
     lds(*p->localDataStore), globalSeqNum(0),  wavefrontSize(p->wfSize)
 {
-    // this check will be eliminated once we have wavefront size support added
-    fatal_if(p->wfSize != VSZ, "Wavefront size parameter does not match VSZ");
+    /*
+      This check is necessary because std::bitset only provides conversion to
+      unsigned long or unsigned long long via to_ulong() or to_ullong().
+      There are a few places in the code where to_ullong() is used, however if
+      the WF size is larger than a value the host can support then bitset will
+      throw a runtime exception. We should remove all use of to_ulong() or
+      to_ullong() conversions so we can set WF size
+      greater than 64, however until that is done this check is required.
+    */
+    fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits ||
+             p->wfSize <= 0,
+             "WF size is larger than the host can support");
+    fatal_if(!isPowerOf2(wavefrontSize),
+             "Wavefront size should be a power of 2");
     // calculate how many cycles a vector load or store will need to transfer
     // its data over the corresponding buses
-    numCyclesPerStoreTransfer = (uint32_t)ceil((double)(VSZ * sizeof(uint32_t))
+    numCyclesPerStoreTransfer = (uint32_t)ceil((double)
+                                               (wfSize() * sizeof(uint32_t))
                                 / (double)vrfToCoalescerBusWidth);
 
-    numCyclesPerLoadTransfer = (VSZ * sizeof(uint32_t))
+    numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
                                / coalescerToVrfBusWidth;
 
     lastVaddrWF.resize(numSIMDs);
@@ -93,24 +107,24 @@
         lastVaddrWF[j].resize(p->n_wf);
 
         for (int i = 0; i < p->n_wf; ++i) {
-            lastVaddrWF[j][i].resize(VSZ);
+            lastVaddrWF[j][i].resize(wfSize());
 
             wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
             wfList[j][i]->setParent(this);
 
-            for (int k = 0; k < VSZ; ++k) {
+            for (int k = 0; k < wfSize(); ++k) {
                 lastVaddrWF[j][i][k] = 0;
             }
         }
     }
 
-    lastVaddrPhase.resize(numSIMDs);
+    lastVaddrSimd.resize(numSIMDs);
 
     for (int i = 0; i < numSIMDs; ++i) {
-        lastVaddrPhase[i] = LastVaddrWave();
+        lastVaddrSimd[i].resize(wfSize(), 0);
     }
 
-    lastVaddrCU = LastVaddrWave();
+    lastVaddrCU.resize(wfSize());
 
     lds.setParent(this);
 
@@ -122,10 +136,10 @@
         fatal("Invalid WF execution policy (CU)\n");
     }
 
-    memPort.resize(VSZ);
+    memPort.resize(wfSize());
 
     // resize the tlbPort vectorArray
-    int tlbPort_width = perLaneTLB ? VSZ : 1;
+    int tlbPort_width = perLaneTLB ? wfSize() : 1;
     tlbPort.resize(tlbPort_width);
 
     cuExitCallback = new CUExitCallback(this);
@@ -144,12 +158,13 @@
 ComputeUnit::~ComputeUnit()
 {
     // Delete wavefront slots
-
-    for (int j = 0; j < numSIMDs; ++j)
+    for (int j = 0; j < numSIMDs; ++j) {
         for (int i = 0; i < shader->n_wf; ++i) {
             delete wfList[j][i];
         }
-
+        lastVaddrSimd[j].clear();
+    }
+    lastVaddrCU.clear();
     readyList.clear();
     waveStatusList.clear();
     dispatchList.clear();
@@ -187,27 +202,25 @@
     VectorMask init_mask;
     init_mask.reset();
 
-    for (int k = 0; k < VSZ; ++k) {
-        if (k + cnt * VSZ < trueWgSizeTotal)
+    for (int k = 0; k < wfSize(); ++k) {
+        if (k + cnt * wfSize() < trueWgSizeTotal)
             init_mask[k] = 1;
     }
 
     wfCtx->init_mask = init_mask.to_ullong();
     wfCtx->exec_mask = init_mask.to_ullong();
 
-    for (int i = 0; i < VSZ; ++i) {
-        wfCtx->bar_cnt[i] = 0;
-    }
+    wfCtx->bar_cnt.resize(wfSize(), 0);
 
     wfCtx->max_bar_cnt = 0;
     wfCtx->old_barrier_cnt = 0;
     wfCtx->barrier_cnt = 0;
 
     wfCtx->privBase = ndr->q.privMemStart;
-    ndr->q.privMemStart += ndr->q.privMemPerItem * VSZ;
+    ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
 
     wfCtx->spillBase = ndr->q.spillMemStart;
-    ndr->q.spillMemStart += ndr->q.spillMemPerItem * VSZ;
+    ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
 
     wfCtx->pc = 0;
     wfCtx->rpc = UINT32_MAX;
@@ -265,10 +278,12 @@
     w->dynwaveid = cnt;
     w->init_mask = wfCtx->init_mask;
 
-    for (int k = 0; k < VSZ; ++k) {
-        w->workitemid[0][k] = (k+cnt*VSZ) % trueWgSize[0];
-        w->workitemid[1][k] = ((k + cnt * VSZ) / trueWgSize[0]) % trueWgSize[1];
-        w->workitemid[2][k] = (k + cnt * VSZ) / (trueWgSize[0] * trueWgSize[1]);
+    for (int k = 0; k < wfSize(); ++k) {
+        w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0];
+        w->workitemid[1][k] = ((k + cnt * wfSize()) / trueWgSize[0])
+            % trueWgSize[1];
+        w->workitemid[2][k] = (k + cnt * wfSize()) /
+            (trueWgSize[0] * trueWgSize[1]);
 
         w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] *
             trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] +
@@ -277,9 +292,9 @@
 
     w->old_barrier_cnt = wfCtx->old_barrier_cnt;
     w->barrier_cnt = wfCtx->barrier_cnt;
-    w->barrier_slots = divCeil(trueWgSizeTotal, VSZ);
+    w->barrier_slots = divCeil(trueWgSizeTotal, wfSize());
 
-    for (int i = 0; i < VSZ; ++i) {
+    for (int i = 0; i < wfSize(); ++i) {
         w->bar_cnt[i] = wfCtx->bar_cnt[i];
     }
 
@@ -315,16 +330,17 @@
     // is this the last wavefront in the workgroup
     // if set the spillWidth to be the remaining work-items
     // so that the vector access is correct
-    if ((cnt + 1) * VSZ >= trueWgSizeTotal) {
-        w->spillWidth = trueWgSizeTotal - (cnt * VSZ);
+    if ((cnt + 1) * wfSize() >= trueWgSizeTotal) {
+        w->spillWidth = trueWgSizeTotal - (cnt * wfSize());
     } else {
-        w->spillWidth = VSZ;
+        w->spillWidth = wfSize();
     }
 
     DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
             "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
 
     w->start(++_n_wave, ndr->q.code_ptr);
+    wfCtx->bar_cnt.clear();
 }
 
 void
@@ -339,7 +355,7 @@
     // Send L1 cache acquire
     // isKernel + isAcquire = Kernel Begin
     if (shader->impl_kern_boundary_sync) {
-        GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(nullptr,
+        GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(this,
                                                                 nullptr,
                                                                 nullptr, 0);
 
@@ -374,7 +390,7 @@
         if (w->status == Wavefront::S_STOPPED) {
             // if we have scheduled all work items then stop
             // scheduling wavefronts
-            if (cnt * VSZ >= trueWgSizeTotal)
+            if (cnt * wfSize() >= trueWgSizeTotal)
                 break;
 
             // reserve vector registers for the scheduled wavefront
@@ -420,7 +436,7 @@
     // work item of the work group
     int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
     bool vregAvail = true;
-    int numWfs = (trueWgSizeTotal + VSZ - 1) / VSZ;
+    int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
     int freeWfSlots = 0;
     // check if the total number of VGPRs required by all WFs of the WG
     // fit in the VRFs of all SIMD units
@@ -623,7 +639,7 @@
     // Setup space for call args
     for (int j = 0; j < numSIMDs; ++j) {
         for (int i = 0; i < shader->n_wf; ++i) {
-            wfList[j][i]->initCallArgMem(shader->funcargs_size);
+            wfList[j][i]->initCallArgMem(shader->funcargs_size, wavefrontSize);
         }
     }
 
@@ -1193,15 +1209,15 @@
         Addr last = 0;
 
         switch(computeUnit->prefetchType) {
-          case Enums::PF_CU:
+        case Enums::PF_CU:
             last = computeUnit->lastVaddrCU[mp_index];
             break;
-          case Enums::PF_PHASE:
-            last = computeUnit->lastVaddrPhase[simdId][mp_index];
+        case Enums::PF_PHASE:
+            last = computeUnit->lastVaddrSimd[simdId][mp_index];
             break;
-          case Enums::PF_WF:
+        case Enums::PF_WF:
             last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
-          default:
+        default:
             break;
         }
 
@@ -1215,7 +1231,7 @@
         DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
 
         computeUnit->lastVaddrCU[mp_index] = vaddr;
-        computeUnit->lastVaddrPhase[simdId][mp_index] = vaddr;
+        computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
         computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
 
         stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
@@ -1486,7 +1502,7 @@
         ;
 
     ldsBankConflictDist
-       .init(0, VSZ, 2)
+       .init(0, wfSize(), 2)
        .name(name() + ".lds_bank_conflicts")
        .desc("Number of bank conflicts per LDS memory packet")
        ;
@@ -1497,27 +1513,28 @@
         ;
 
     pageDivergenceDist
-       // A wavefront can touch 1 to VSZ pages per memory instruction.
-       // The number of pages per bin can be configured (here it's 4).
-       .init(1, VSZ, 4)
+        // A wavefront can touch up to N pages per memory instruction where
+        // N is equal to the wavefront size.
+        // The number of pages per bin can be configured (here it's 4).
+       .init(1, wfSize(), 4)
        .name(name() + ".page_divergence_dist")
        .desc("pages touched per wf (over all mem. instr.)")
        ;
 
     controlFlowDivergenceDist
-        .init(1, VSZ, 4)
+        .init(1, wfSize(), 4)
         .name(name() + ".warp_execution_dist")
         .desc("number of lanes active per instruction (oval all instructions)")
         ;
 
     activeLanesPerGMemInstrDist
-        .init(1, VSZ, 4)
+        .init(1, wfSize(), 4)
         .name(name() + ".gmem_lanes_execution_dist")
         .desc("number of active lanes per global memory instruction")
         ;
 
     activeLanesPerLMemInstrDist
-        .init(1, VSZ, 4)
+        .init(1, wfSize(), 4)
         .name(name() + ".lmem_lanes_execution_dist")
         .desc("number of active lanes per local memory instruction")
         ;
@@ -1529,7 +1546,7 @@
 
     numVecOpsExecuted
         .name(name() + ".num_vec_ops_executed")
-        .desc("number of vec ops executed (e.g. VSZ/inst)")
+        .desc("number of vec ops executed (e.g. WF size/inst)")
         ;
 
     totalCycles
diff --git a/src/gpu-compute/dispatcher.hh b/src/gpu-compute/dispatcher.hh
--- a/src/gpu-compute/dispatcher.hh
+++ b/src/gpu-compute/dispatcher.hh
@@ -157,6 +157,7 @@
 
         // helper functions to retrieve/set GPU attributes
         int getNumCUs();
+        int wfSize() const;
         void setFuncargsSize(int funcargs_size);
 };
 
diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc
--- a/src/gpu-compute/dispatcher.cc
+++ b/src/gpu-compute/dispatcher.cc
@@ -387,6 +387,12 @@
     return shader->cuList.size();
 }
 
+int
+GpuDispatcher::wfSize() const
+{
+    return shader->cuList[0]->wfSize();
+}
+
 void
 GpuDispatcher::setFuncargsSize(int funcargs_size)
 {
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -179,9 +179,9 @@
                 int physVgpr = w->remap(dst, sizeof(c0), 1);
                 // save the physical VGPR index
                 regVec.push_back(physVgpr);
-                c1 *p1 = &((c1*)m->d_data)[k * VSZ];
+                c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
 
-                for (int i = 0; i < VSZ; ++i) {
+                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                     if (m->exec_mask[i]) {
                         DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                 "$%s%d <- %d global ld done (src = wavefront "
diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh
--- a/src/gpu-compute/gpu_dyn_inst.hh
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@@ -205,7 +205,7 @@
   public:
     GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst,
                uint64_t instSeqNum);
-
+    ~GPUDynInst();
     void execute();
     int numSrcRegOperands();
     int numDstRegOperands();
@@ -226,15 +226,15 @@
     Enums::StorageClassType executedAs();
 
     // The address of the memory operation
-    Addr addr[VSZ];
+    std::vector<Addr> addr;
     Addr pAddr;
 
     // The data to get written
-    uint8_t d_data[VSZ * 16];
+    uint8_t *d_data;
     // Additional data (for atomics)
-    uint8_t a_data[VSZ * 8];
+    uint8_t *a_data;
     // Additional data (for atomics)
-    uint8_t x_data[VSZ * 8];
+    uint8_t *x_data;
     // The execution mask
     VectorMask exec_mask;
 
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -42,11 +42,29 @@
 
 GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
                        GPUStaticInst *_staticInst, uint64_t instSeqNum)
-    : GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF),
+    : GPUExecContext(_cu, _wf), addr(computeUnit()->wfSize(), (Addr)0),
+      m_op(Enums::MO_UNDEF),
       memoryOrder(Enums::MEMORY_ORDER_NONE), useContinuation(false),
       statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum)
 {
-    tlbHitLevel.assign(VSZ, -1);
+    tlbHitLevel.assign(computeUnit()->wfSize(), -1);
+    d_data = new uint8_t[computeUnit()->wfSize() * 16];
+    a_data = new uint8_t[computeUnit()->wfSize() * 8];
+    x_data = new uint8_t[computeUnit()->wfSize() * 8];
+    for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) {
+        a_data[i] = 0;
+        x_data[i] = 0;
+    }
+    for (int i = 0; i < (computeUnit()->wfSize() * 16); ++i) {
+        d_data[i] = 0;
+    }
+}
+
+GPUDynInst::~GPUDynInst()
+{
+    delete[] d_data;
+    delete[] a_data;
+    delete[] x_data;
 }
 
 void
diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc
--- a/src/gpu-compute/local_memory_pipeline.cc
+++ b/src/gpu-compute/local_memory_pipeline.cc
@@ -148,9 +148,9 @@
             int physVgpr = w->remap(dst,sizeof(c0),1);
             // save the physical VGPR index
             regVec.push_back(physVgpr);
-            c1 *p1 = &((c1*)m->d_data)[k * VSZ];
+            c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
 
-            for (int i = 0; i < VSZ; ++i) {
+            for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                 if (m->exec_mask[i]) {
                     // write the value into the physical VGPR. This is a purely
                     // functional operation. No timing is modeled.
diff --git a/src/gpu-compute/misc.hh b/src/gpu-compute/misc.hh
--- a/src/gpu-compute/misc.hh
+++ b/src/gpu-compute/misc.hh
@@ -38,27 +38,13 @@
 
 #include <bitset>
 #include <memory>
+#include <limits>
 
 #include "base/misc.hh"
 
 class GPUDynInst;
 
-// wavefront size of the machine
-static const int VSZ = 64;
-
-/*
- This check is necessary because std::bitset only provides conversion to
- unsigned long or unsigned long long via to_ulong() or to_ullong(). there are
- a few places in the code where to_ullong() is used, however if VSZ is larger
- than a value the host can support then bitset will throw a runtime exception.
-
- we should remove all use of to_long() or to_ullong() so we can have VSZ
- greater than 64b, however until that is done this assert is required.
- */
-static_assert(VSZ <= sizeof(unsigned long long) * 8,
-              "VSZ is larger than the host can support");
-
-typedef std::bitset<VSZ> VectorMask;
+typedef std::bitset<std::numeric_limits<unsigned long long>::digits> VectorMask;
 typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr;
 
 class WaitClass
diff --git a/src/gpu-compute/qstruct.hh b/src/gpu-compute/qstruct.hh
--- a/src/gpu-compute/qstruct.hh
+++ b/src/gpu-compute/qstruct.hh
@@ -100,7 +100,7 @@
 {
     // 32 bit values
     // barrier state
-    int bar_cnt[VSZ];
+    std::vector<int> bar_cnt;
 
     // id (which WF in the WG)
     int cnt;
diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc
--- a/src/gpu-compute/vector_register_file.cc
+++ b/src/gpu-compute/vector_register_file.cc
@@ -63,7 +63,7 @@
     nxtBusy.clear();
     nxtBusy.resize(numRegsPerSimd, 0);
 
-    vgprState->init(numRegsPerSimd);
+    vgprState->init(numRegsPerSimd, p->wfSize);
 }
 
 void
diff --git a/src/gpu-compute/vector_register_state.hh b/src/gpu-compute/vector_register_state.hh
--- a/src/gpu-compute/vector_register_state.hh
+++ b/src/gpu-compute/vector_register_state.hh
@@ -51,7 +51,7 @@
 {
   public:
     VecRegisterState();
-    void init(uint32_t _size);
+    void init(uint32_t _size, uint32_t wf_size);
 
     const std::string& name() const { return _name; }
     void setParent(ComputeUnit *_computeUnit);
@@ -93,9 +93,9 @@
     ComputeUnit *computeUnit;
     std::string _name;
     // 32-bit Single Precision Vector Register State
-    std::vector<std::array<uint32_t, VSZ>> s_reg;
+    std::vector<std::vector<uint32_t>> s_reg;
     // 64-bit Double Precision Vector Register State
-    std::vector<std::array<uint64_t, VSZ>> d_reg;
+    std::vector<std::vector<uint64_t>> d_reg;
 };
 
 #endif // __VECTOR_REGISTER_STATE_HH__
diff --git a/src/gpu-compute/vector_register_state.cc b/src/gpu-compute/vector_register_state.cc
--- a/src/gpu-compute/vector_register_state.cc
+++ b/src/gpu-compute/vector_register_state.cc
@@ -35,6 +35,8 @@
 
 #include "gpu-compute/vector_register_state.hh"
 
+#include <limits>
+
 #include "gpu-compute/compute_unit.hh"
 
 VecRegisterState::VecRegisterState() : computeUnit(nullptr)
@@ -51,8 +53,19 @@
 }
 
 void
-VecRegisterState::init(uint32_t _size)
+VecRegisterState::init(uint32_t _size, uint32_t wf_size)
 {
     s_reg.resize(_size);
+    fatal_if(wf_size > std::numeric_limits<unsigned long long>::digits ||
+             wf_size <= 0,
+             "WF size is larger than the host can support or is zero");
+    fatal_if((wf_size & (wf_size - 1)) != 0,
+             "Wavefront size should be a power of 2");
+    for (int i = 0; i < s_reg.size(); ++i) {
+        s_reg[i].resize(wf_size, 0);
+    }
     d_reg.resize(_size);
+    for (int i = 0; i < d_reg.size(); ++i) {
+        d_reg[i].resize(wf_size, 0);
+    }
 }