diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh --- a/src/gpu-compute/wavefront.hh +++ b/src/gpu-compute/wavefront.hh @@ -83,6 +83,7 @@ public: // pointer to buffer for storing function arguments uint8_t *mem; + int wfSize; // size of function args int funcArgsSizePerItem; @@ -90,13 +91,13 @@ int getLaneOffset(int lane, int addr) { - return addr * VSZ + sizeof(CType) * lane; + return addr * wfSize + sizeof(CType) * lane; } - CallArgMem(int func_args_size_per_item) - : funcArgsSizePerItem(func_args_size_per_item) + CallArgMem(int func_args_size_per_item, int wf_size) + : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item) { - mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ); + mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize); } ~CallArgMem() @@ -192,9 +193,9 @@ bool isOldestInstALU(); bool isOldestInstBarrier(); // used for passing spill address to DDInstGPU - uint64_t last_addr[VSZ]; - uint32_t workitemid[3][VSZ]; - uint32_t workitemFlatId[VSZ]; + std::vector last_addr; + std::vector workitemid[3]; + std::vector workitemFlatId; uint32_t workgroupid[3]; uint32_t workgroupsz[3]; uint32_t gridsz[3]; @@ -230,14 +231,14 @@ uint32_t startVgprIndex; // Old value of destination gpr (for trace) - uint32_t old_vgpr[VSZ]; + std::vector old_vgpr; // Id of destination gpr (for trace) uint32_t old_vgpr_id; // Tick count of last old_vgpr copy uint64_t old_vgpr_tcnt; // Old value of destination gpr (for trace) - uint64_t old_dgpr[VSZ]; + std::vector old_dgpr; // Id of destination gpr (for trace) uint32_t old_dgpr_id; // Tick count of last old_vgpr copy @@ -247,7 +248,7 @@ VectorMask init_mask; // number of barriers this WF has joined - int bar_cnt[VSZ]; + std::vector bar_cnt; int max_bar_cnt; // Flag to stall a wave on barrier bool stalledAtBarrier; @@ -296,9 +297,9 @@ // argument memory for hsail call instruction CallArgMem *callArgMem; void - initCallArgMem(int func_args_size_per_item) + initCallArgMem(int func_args_size_per_item, int 
wf_size) { - callArgMem = new CallArgMem(func_args_size_per_item); + callArgMem = new CallArgMem(func_args_size_per_item, wf_size); } template @@ -327,7 +328,6 @@ } void start(uint64_t _wfDynId, uint64_t _base_ptr); - void exec(); void updateResources(); int ready(itype_e type); diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -55,7 +55,6 @@ last_trace = 0; simdId = p->simdId; wfSlotId = p->wf_slot_id; - status = S_STOPPED; reservedVectorRegs = 0; startVgprIndex = 0; @@ -77,12 +76,21 @@ mem_trace_busy = 0; old_vgpr_tcnt = 0xffffffffffffffffll; old_dgpr_tcnt = 0xffffffffffffffffll; + old_vgpr.resize(p->wfSize); pendingFetch = false; dropFetch = false; condRegState = new ConditionRegisterState(); maxSpVgprs = 0; maxDpVgprs = 0; + last_addr.resize(p->wfSize); + workitemFlatId.resize(p->wfSize); + old_dgpr.resize(p->wfSize); + bar_cnt.resize(p->wfSize); + remoteAddrList.resize(p->wfSize); + for (int i = 0; i < 3; ++i) { + workitemid[i].resize(p->wfSize); + } } void @@ -142,6 +150,7 @@ { if (callArgMem) delete callArgMem; + delete condRegState; } void # Node ID 3a8eb18b522f6b3f93e13610bc1c9c1fa3d9b11a # Parent bda2c39fd9fdcedbe7a4b1df38f9ac1279208eee diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py --- a/configs/example/apu_se.py +++ b/configs/example/apu_se.py @@ -250,8 +250,10 @@ vrfs = [] for j in xrange(options.simds_per_cu): for k in xrange(shader.n_wf): - wavefronts.append(Wavefront(simdId = j, wf_slot_id = k)) + wavefronts.append(Wavefront(simdId = j, wf_slot_id = k, \ + wfSize = options.wf_size)) vrfs.append(VectorRegisterFile(simd_id=j, + wfSize=options.wf_size, num_regs_per_simd=options.vreg_file_size)) compute_units[-1].wavefronts = wavefronts compute_units[-1].vector_register_file = vrfs diff --git a/src/arch/hsail/gen.py b/src/arch/hsail/gen.py --- a/src/arch/hsail/gen.py +++ b/src/arch/hsail/gen.py @@ -235,7 +235,7 @@ const VectorMask &mask = 
w->get_pred(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { DestCType dest_val = $expr; this->dest.set(w, lane, dest_val); @@ -256,7 +256,7 @@ const VectorMask &mask = w->get_pred(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { SrcCType src_val0 = this->src0.get(w, lane); DestCType dest_val = $expr; @@ -277,7 +277,7 @@ const VectorMask &mask = w->get_pred(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { CType dest_val; if ($dest_is_src_flag) { @@ -312,7 +312,7 @@ const VectorMask &mask = w->get_pred(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { CType dest_val; @@ -346,7 +346,7 @@ const VectorMask &mask = w->get_pred(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { DestT dest_val; if ($dest_is_src_flag) { @@ -372,7 +372,7 @@ Wavefront *w = gpuDynInst->wavefront(); const VectorMask &mask = w->get_pred(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { CType dest_val; @@ -401,7 +401,7 @@ const VectorMask &mask = w->get_pred(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { DestCType dest_val; SrcCType src_val[$num_srcs]; diff --git a/src/arch/hsail/insts/branch.hh b/src/arch/hsail/insts/branch.hh --- a/src/arch/hsail/insts/branch.hh +++ b/src/arch/hsail/insts/branch.hh @@ -279,7 +279,7 @@ // taken branch const uint32_t true_pc = getTargetPc(); VectorMask true_mask; - for (unsigned int lane = 0; lane < VSZ; ++lane) { + for (unsigned int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { true_mask[lane] = cond.get(w, lane) 
& curr_mask[lane]; } diff --git a/src/arch/hsail/insts/main.cc b/src/arch/hsail/insts/main.cc --- a/src/arch/hsail/insts/main.cc +++ b/src/arch/hsail/insts/main.cc @@ -134,7 +134,7 @@ const VectorMask &mask = w->get_pred(); // mask off completed work-items - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { w->init_mask[lane] = 0; } diff --git a/src/arch/hsail/insts/mem.hh b/src/arch/hsail/insts/mem.hh --- a/src/arch/hsail/insts/mem.hh +++ b/src/arch/hsail/insts/mem.hh @@ -454,7 +454,7 @@ gpuDynInst->statusBitVector = gpuDynInst->exec_mask; if (num_dest_operands > 1) { - for (int i = 0; i < VSZ; ++i) + for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) if (gpuDynInst->exec_mask[i]) gpuDynInst->statusVector.push_back(num_dest_operands); else @@ -463,9 +463,10 @@ for (int k = 0; k < num_dest_operands; ++k) { - c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ]; + c0 *d = &((c0*)gpuDynInst->d_data) + [k * gpuDynInst->computeUnit()->wfSize()]; - for (int i = 0; i < VSZ; ++i) { + for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) { if (gpuDynInst->exec_mask[i]) { Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0); @@ -1001,7 +1002,7 @@ gpuDynInst->statusBitVector = gpuDynInst->exec_mask; if (num_src_operands > 1) { - for (int i = 0; i < VSZ; ++i) + for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) if (gpuDynInst->exec_mask[i]) gpuDynInst->statusVector.push_back(num_src_operands); else @@ -1009,9 +1010,10 @@ } for (int k = 0; k < num_src_operands; ++k) { - c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ]; + c0 *d = &((c0*)gpuDynInst->d_data) + [k * gpuDynInst->computeUnit()->wfSize()]; - for (int i = 0; i < VSZ; ++i) { + for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) { if (gpuDynInst->exec_mask[i]) { Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0); @@ -1399,7 +1401,7 @@ c0 *e = &((c0*) gpuDynInst->a_data)[0]; c0 *f = &((c0*) gpuDynInst->x_data)[0]; - for (int i 
= 0; i < VSZ; ++i) { + for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) { if (gpuDynInst->exec_mask[i]) { Addr vaddr = gpuDynInst->addr[i]; diff --git a/src/arch/hsail/insts/mem_impl.hh b/src/arch/hsail/insts/mem_impl.hh --- a/src/arch/hsail/insts/mem_impl.hh +++ b/src/arch/hsail/insts/mem_impl.hh @@ -60,14 +60,16 @@ typedef typename DestDataType::CType CType M5_VAR_USED; const VectorMask &mask = w->get_pred(); - uint64_t addr_vec[VSZ]; + std::vector addr_vec; + addr_vec.resize(w->computeUnit->wfSize(), (Addr)0); this->addr.calcVector(w, addr_vec); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { this->dest.set(w, lane, addr_vec[lane]); } } + addr_vec.clear(); } templateparent->findSymbol(Brig::BrigPrivateSpace, addr); assert(se); - return w->wfSlotId * w->privSizePerItem * VSZ + - se->offset * VSZ + + return w->wfSlotId * w->privSizePerItem * w->computeUnit->wfSize() + + se->offset * w->computeUnit->wfSize() + lane * se->size; */ @@ -139,9 +141,11 @@ Addr addr_div8 = addr / 8; Addr addr_mod8 = addr % 8; - Addr ret = addr_div8 * 8 * VSZ + lane * 8 + addr_mod8 + w->privBase; + Addr ret = addr_div8 * 8 * w->computeUnit->wfSize() + lane * 8 + + addr_mod8 + w->privBase; - assert(ret < w->privBase + (w->privSizePerItem * VSZ)); + assert(ret < w->privBase + + (w->privSizePerItem * w->computeUnit->wfSize())); return ret; } @@ -175,7 +179,7 @@ DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { this->dest.set(w, lane, val); } @@ -184,7 +188,7 @@ return; } else if (this->segment == Brig::BRIG_SEGMENT_ARG) { uint64_t address = this->addr.calcUniform(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { MemCType val = w->readCallArgMem(lane, address); @@ -239,7 +243,7 @@ // this is a 
complete hack to get around a compiler bug // (the compiler currently generates global access for private // addresses (starting from 0). We need to add the private offset) - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (m->addr[lane] < w->privSizePerItem) { if (mask[lane]) { // what is the size of the object we are accessing? @@ -267,7 +271,7 @@ m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); { - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { // note: this calculation will NOT WORK if the compiler // ever generates loads/stores to the same address with // different widths (e.g., a ld_u32 addr and a ld_u16 addr) @@ -301,7 +305,7 @@ m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { assert(m->addr[lane] + sizeof(MemCType) <= w->roSize); m->addr[lane] += w->roBase; @@ -318,7 +322,7 @@ m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); { - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { assert(m->addr[lane] < w->privSizePerItem); @@ -360,7 +364,7 @@ if (this->segment == Brig::BRIG_SEGMENT_ARG) { uint64_t address = this->addr.calcUniform(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { CType data = this->src.template get(w, lane); DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data); @@ -378,7 +382,7 @@ this->addr.calcVector(w, m->addr); if (num_src_operands == 1) { - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { ((CType*)m->d_data)[lane] = this->src.template get(w, lane); @@ -386,9 +390,10 @@ } } else { for (int k= 0; k < 
num_src_operands; ++k) { - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { - ((CType*)m->d_data)[k * VSZ + lane] = + ((CType*)m->d_data) + [k * w->computeUnit->wfSize() + lane] = this->src_vect[k].template get(w, lane); } } @@ -428,7 +433,7 @@ // this is a complete hack to get around a compiler bug // (the compiler currently generates global access for private // addresses (starting from 0). We need to add the private offset) - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { if (m->addr[lane] < w->privSizePerItem) { @@ -454,7 +459,7 @@ m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); { - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { assert(m->addr[lane] < w->spillSizePerItem); @@ -483,7 +488,7 @@ m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); { - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { assert(m->addr[lane] < w->privSizePerItem); m->addr[lane] = m->addr[lane] + lane * @@ -558,14 +563,14 @@ this->addr.calcVector(w, m->addr); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { ((CType *)m->a_data)[lane] = this->src[0].template get(w, lane); } // load second source operand for CAS if (NumSrcOperands > 1) { - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { ((CType*)m->x_data)[lane] = this->src[1].template get(w, lane); } diff --git a/src/arch/hsail/insts/pseudo_inst.cc b/src/arch/hsail/insts/pseudo_inst.cc --- a/src/arch/hsail/insts/pseudo_inst.cc +++ b/src/arch/hsail/insts/pseudo_inst.cc @@ -84,7 +84,7 @@ int op = 0; bool got_op = false; - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; 
lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { int src_val0 = src1.get(w, lane, 0); if (got_op) { @@ -182,7 +182,7 @@ { #if TRACING_ON const VectorMask &mask = w->get_pred(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { int src_val1 = src1.get(w, lane, 1); int src_val2 = src1.get(w, lane, 2); @@ -205,7 +205,7 @@ { #if TRACING_ON const VectorMask &mask = w->get_pred(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { int64_t src_val1 = src1.get(w, lane, 1); int src_val2 = src1.get(w, lane, 2); @@ -231,7 +231,7 @@ std::string res_str; res_str = csprintf("krl_prt (%s)\n", disassemble()); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (!(lane & 7)) { res_str += csprintf("DB%03d: ", (int)w->wfDynId); } @@ -270,7 +270,7 @@ int src_val3 = -1; res_str = csprintf("krl_prt (%s)\n", disassemble()); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (!(lane & 7)) { res_str += csprintf("DB%03d: ", (int)w->wfDynId); } @@ -311,7 +311,7 @@ std::string res_str; res_str = csprintf("krl_prt (%s)\n", disassemble()); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (!(lane & 3)) { res_str += csprintf("DB%03d: ", (int)w->wfDynId); } @@ -350,7 +350,7 @@ int src_val3 = -1; res_str = csprintf("krl_prt (%s)\n", disassemble()); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (!(lane & 3)) { res_str += csprintf("DB%03d: ", (int)w->wfDynId); } @@ -391,7 +391,7 @@ std::string res_str; res_str = csprintf("krl_prt (%s)\n", disassemble()); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (!(lane & 7)) { res_str += 
csprintf("DB%03d: ", (int)w->wfDynId); } @@ -430,7 +430,7 @@ res_str += csprintf(" Executing on CU #%i\n", w->computeUnit->cu_id); res_str += csprintf(" Exec mask: "); - for (int i = VSZ - 1; i >= 0; --i) { + for (int i = w->computeUnit->wfSize() - 1; i >= 0; --i) { if (w->execMask(i)) res_str += "1"; else @@ -458,7 +458,7 @@ const VectorMask &mask = w->get_pred(); int res = 0; - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { int src_val1 = src1.get(w, lane, 1); dest.set(w, lane, res); @@ -477,14 +477,14 @@ const VectorMask &mask = w->get_pred(); int res = 0; - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { int src_val1 = src1.get(w, lane, 1); res += src_val1; } } - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { dest.set(w, lane, res); } @@ -497,19 +497,19 @@ const VectorMask &mask = w->get_pred(); int res = 0; - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { int src_val1 = src1.get(w, lane, 1); if (src_val1) { - if (lane < (VSZ/2)) { + if (lane < (w->computeUnit->wfSize()/2)) { res = res | ((uint32_t)(1) << lane); } } } } - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { dest.set(w, lane, res); } @@ -521,19 +521,20 @@ { const VectorMask &mask = w->get_pred(); int res = 0; - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { int src_val1 = src1.get(w, lane, 1); if (src_val1) { - if (lane >= (VSZ/2)) { - res = res | ((uint32_t)(1) << (lane - (VSZ/2))); + if (lane >= (w->computeUnit->wfSize()/2)) { + res = res | ((uint32_t)(1) << + (lane - (w->computeUnit->wfSize()/2))); } } } } - for (int lane = 0; lane < VSZ; ++lane) { + 
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { dest.set(w, lane, res); } @@ -546,7 +547,7 @@ const VectorMask &mask = w->get_pred(); int max_cnt = 0; - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { w->bar_cnt[lane]++; @@ -567,7 +568,7 @@ const VectorMask &mask = w->get_pred(); int max_cnt = 0; - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { w->bar_cnt[lane]--; } @@ -592,7 +593,7 @@ { const VectorMask &mask = w->get_pred(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { int src_val1 = src1.get(w, lane, 1); panic("OpenCL Code failed assertion #%d. Triggered by lane %s", @@ -605,7 +606,7 @@ Call::calcAddr(Wavefront *w, GPUDynInstPtr m) { // the address is in src1 | src2 - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { int src_val1 = src1.get(w, lane, 1); int src_val2 = src1.get(w, lane, 2); Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2); @@ -622,7 +623,7 @@ calcAddr(w, m); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { ((int*)m->a_data)[lane] = src1.get(w, lane, 3); } @@ -661,7 +662,7 @@ GPUDynInstPtr m = gpuDynInst; calcAddr(w, m); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { ((int*)m->a_data)[lane] = src1.get(w, lane, 1); } @@ -736,7 +737,7 @@ const VectorMask &mask = w->get_pred(); int src_val1 = 0; - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (mask[lane]) { src_val1 = src1.get(w, lane, 1); break; @@ -758,7 +759,7 @@ const VectorMask &mask = w->get_pred(); unsigned mst = true; - for (int lane = VSZ - 1; lane >= 0; --lane) { + for (int lane = 
w->computeUnit->wfSize() - 1; lane >= 0; --lane) { if (mask[lane]) { dest.set(w, lane, mst); mst = false; @@ -773,7 +774,7 @@ int res = 0; bool got_res = false; - for (int lane = VSZ - 1; lane >= 0; --lane) { + for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) { if (mask[lane]) { if (!got_res) { res = src1.get(w, lane, 1); diff --git a/src/arch/hsail/operand.hh b/src/arch/hsail/operand.hh --- a/src/arch/hsail/operand.hh +++ b/src/arch/hsail/operand.hh @@ -43,6 +43,7 @@ */ #include +#include #include "arch/hsail/Brig.h" #include "base/trace.hh" @@ -346,6 +347,8 @@ template class ImmOperand : public BaseOperand { + private: + uint16_t kind; public: T bits; @@ -355,11 +358,21 @@ template OperandType - get() + get(Wavefront *w) { assert(sizeof(OperandType) <= sizeof(T)); + panic_if(w == nullptr, "WF pointer needs to be set"); - return *(OperandType*)&bits; + switch (kind) { + // immediate operand is WF size + case Brig::BRIG_KIND_OPERAND_WAVESIZE: + return (OperandType)w->computeUnit->wfSize(); + break; + + default: + return *(OperandType*)&bits; + break; + } } // This version of get() takes a WF* and a lane id for @@ -368,7 +381,7 @@ OperandType get(Wavefront *w, int lane) { - return get(); + return get(w); } }; @@ -388,16 +401,18 @@ auto cbptr = (Brig::BrigOperandConstantBytes*)brigOp; bits = *((T*)(obj->getData(cbptr->bytes + 4))); - + kind = brigOp->kind; return true; } break; case Brig::BRIG_KIND_OPERAND_WAVESIZE: - bits = VSZ; + kind = brigOp->kind; + bits = std::numeric_limits::digits; return true; default: + kind = Brig::BRIG_KIND_NONE; return false; } } @@ -409,6 +424,7 @@ const Brig::BrigOperand *brigOp = obj->getOperand(opOffset); if (brigOp->kind != Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { + kind = Brig::BRIG_KIND_NONE; return false; } @@ -423,6 +439,7 @@ (const Brig::BrigOperand *)obj->getOperand(*data_offset); if (p->kind != Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) { + kind = Brig::BRIG_KIND_NONE; return false; } @@ -456,7 +473,7 @@ 
OperandType get(Wavefront *w, int lane) { - return is_imm ? imm_op.template get() : + return is_imm ? imm_op.template get(w) : reg_op.template get(w, lane); } @@ -571,7 +588,7 @@ uint64_t calcUniformBase(); public: - virtual void calcVector(Wavefront *w, uint64_t *addrVec) = 0; + virtual void calcVector(Wavefront *w, std::vector &addrVec) = 0; virtual uint64_t calcLane(Wavefront *w, int lane=0) = 0; uint64_t offset; @@ -586,7 +603,7 @@ RegOperandType reg; void init(unsigned opOffset, const BrigObject *obj); uint64_t calcUniform(); - void calcVector(Wavefront *w, uint64_t *addrVec); + void calcVector(Wavefront *w, std::vector &addrVec); uint64_t calcLane(Wavefront *w, int lane=0); uint32_t opSize() { return reg.opSize(); } bool isVectorRegister() { return reg.registerType == Enums::RT_VECTOR; } @@ -641,11 +658,12 @@ template void -RegAddrOperand::calcVector(Wavefront *w, uint64_t *addrVec) +RegAddrOperand::calcVector(Wavefront *w, + std::vector &addrVec) { Addr address = calcUniformBase(); - for (int lane = 0; lane < VSZ; ++lane) { + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { if (w->execMask(lane)) { if (reg.regFileChar == 's') { addrVec[lane] = address + reg.template get(w, lane); @@ -680,7 +698,7 @@ public: void init(unsigned opOffset, const BrigObject *obj); uint64_t calcUniform(); - void calcVector(Wavefront *w, uint64_t *addrVec); + void calcVector(Wavefront *w, std::vector &addrVec); uint64_t calcLane(Wavefront *w, int lane=0); std::string disassemble(); }; @@ -698,11 +716,11 @@ } inline void -NoRegAddrOperand::calcVector(Wavefront *w, uint64_t *addrVec) +NoRegAddrOperand::calcVector(Wavefront *w, std::vector &addrVec) { uint64_t address = calcUniformBase(); - for (int lane = 0; lane < VSZ; ++lane) + for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) addrVec[lane] = address; } diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -60,6 +60,7 @@ simd_id = 
Param.Int(0, 'SIMD ID associated with this VRF') num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD') min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF') + wfSize = Param.Int(64, 'Wavefront size (in work items)') class Wavefront(SimObject): type = 'Wavefront' @@ -68,6 +69,7 @@ simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)') wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)') + wfSize = Param.Int(64, 'Wavefront size (in work items)') class ComputeUnit(MemObject): type = 'ComputeUnit' diff --git a/src/gpu-compute/cl_driver.cc b/src/gpu-compute/cl_driver.cc --- a/src/gpu-compute/cl_driver.cc +++ b/src/gpu-compute/cl_driver.cc @@ -238,7 +238,7 @@ case HSA_GET_VSZ: { BufferArg buf(buf_addr, sizeof(uint32_t)); - *((uint32_t*)buf.bufferPtr()) = VSZ; + *((uint32_t*)buf.bufferPtr()) = dispatcher->wfSize(); buf.copyOut(tc->getMemProxy()); } break; diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -161,22 +161,8 @@ // if fixed-stride prefetching, this is the stride. 
int prefetchStride; - class LastVaddrWave - { - public: - Addr vaddrs[VSZ]; - Addr& operator[](int idx) { - return vaddrs[idx]; - } - - LastVaddrWave() { - for (int i = 0; i < VSZ; ++i) - vaddrs[i] = 0; - } - }; - - LastVaddrWave lastVaddrCU; - std::vector lastVaddrPhase; + std::vector lastVaddrCU; + std::vector> lastVaddrSimd; std::vector>> lastVaddrWF; Enums::PrefetchType prefetchType; EXEC_POLICY exec_policy; diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -32,8 +32,9 @@ * * Author: John Kalamatianos, Anthony Gutierrez */ +#include "gpu-compute/compute_unit.hh" -#include "gpu-compute/compute_unit.hh" +#include #include "base/output.hh" #include "debug/GPUDisp.hh" @@ -76,14 +77,27 @@ _masterId(p->system->getMasterId(name() + ".ComputeUnit")), lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize) { - // this check will be eliminated once we have wavefront size support added - fatal_if(p->wfSize != VSZ, "Wavefront size parameter does not match VSZ"); + /* + This check is necessary because std::bitset only provides conversion to + unsigned long or unsigned long long via to_ulong() or to_ullong(). + There are a few places in the code where to_ullong() is used, however if + the WF size is larger than a value the host can support then bitset will + throw a runtime exception. We should remove all use of to_ulong() or + to_ullong() conversions so we can set WF size + greater than 64, however until that is done this assert is required. 
+ */ + fatal_if(p->wfSize > std::numeric_limits::digits || + p->wfSize <= 0, + "WF size is larger than the host can support"); + fatal_if(!isPowerOf2(wavefrontSize), + "Wavefront size should be a power of 2"); // calculate how many cycles a vector load or store will need to transfer // its data over the corresponding buses - numCyclesPerStoreTransfer = (uint32_t)ceil((double)(VSZ * sizeof(uint32_t)) + numCyclesPerStoreTransfer = (uint32_t)ceil((double) + (wfSize() * sizeof(uint32_t)) / (double)vrfToCoalescerBusWidth); - numCyclesPerLoadTransfer = (VSZ * sizeof(uint32_t)) + numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t)) / coalescerToVrfBusWidth; lastVaddrWF.resize(numSIMDs); @@ -93,24 +107,24 @@ lastVaddrWF[j].resize(p->n_wf); for (int i = 0; i < p->n_wf; ++i) { - lastVaddrWF[j][i].resize(VSZ); + lastVaddrWF[j][i].resize(wfSize()); wfList[j].push_back(p->wavefronts[j * p->n_wf + i]); wfList[j][i]->setParent(this); - for (int k = 0; k < VSZ; ++k) { + for (int k = 0; k < wfSize(); ++k) { lastVaddrWF[j][i][k] = 0; } } } - lastVaddrPhase.resize(numSIMDs); + lastVaddrSimd.resize(numSIMDs); for (int i = 0; i < numSIMDs; ++i) { - lastVaddrPhase[i] = LastVaddrWave(); + lastVaddrSimd[i].resize(wfSize(), 0); } - lastVaddrCU = LastVaddrWave(); + lastVaddrCU.resize(wfSize()); lds.setParent(this); @@ -122,10 +136,10 @@ fatal("Invalid WF execution policy (CU)\n"); } - memPort.resize(VSZ); + memPort.resize(wfSize()); // resize the tlbPort vectorArray - int tlbPort_width = perLaneTLB ? VSZ : 1; + int tlbPort_width = perLaneTLB ? 
wfSize() : 1; tlbPort.resize(tlbPort_width); cuExitCallback = new CUExitCallback(this); @@ -144,12 +158,13 @@ ComputeUnit::~ComputeUnit() { // Delete wavefront slots - - for (int j = 0; j < numSIMDs; ++j) + for (int j = 0; j < numSIMDs; ++j) { for (int i = 0; i < shader->n_wf; ++i) { delete wfList[j][i]; } - + lastVaddrSimd[j].clear(); + } + lastVaddrCU.clear(); readyList.clear(); waveStatusList.clear(); dispatchList.clear(); @@ -187,27 +202,25 @@ VectorMask init_mask; init_mask.reset(); - for (int k = 0; k < VSZ; ++k) { - if (k + cnt * VSZ < trueWgSizeTotal) + for (int k = 0; k < wfSize(); ++k) { + if (k + cnt * wfSize() < trueWgSizeTotal) init_mask[k] = 1; } wfCtx->init_mask = init_mask.to_ullong(); wfCtx->exec_mask = init_mask.to_ullong(); - for (int i = 0; i < VSZ; ++i) { - wfCtx->bar_cnt[i] = 0; - } + wfCtx->bar_cnt.resize(wfSize(), 0); wfCtx->max_bar_cnt = 0; wfCtx->old_barrier_cnt = 0; wfCtx->barrier_cnt = 0; wfCtx->privBase = ndr->q.privMemStart; - ndr->q.privMemStart += ndr->q.privMemPerItem * VSZ; + ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize(); wfCtx->spillBase = ndr->q.spillMemStart; - ndr->q.spillMemStart += ndr->q.spillMemPerItem * VSZ; + ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize(); wfCtx->pc = 0; wfCtx->rpc = UINT32_MAX; @@ -265,10 +278,12 @@ w->dynwaveid = cnt; w->init_mask = wfCtx->init_mask; - for (int k = 0; k < VSZ; ++k) { - w->workitemid[0][k] = (k+cnt*VSZ) % trueWgSize[0]; - w->workitemid[1][k] = ((k + cnt * VSZ) / trueWgSize[0]) % trueWgSize[1]; - w->workitemid[2][k] = (k + cnt * VSZ) / (trueWgSize[0] * trueWgSize[1]); + for (int k = 0; k < wfSize(); ++k) { + w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0]; + w->workitemid[1][k] = ((k + cnt * wfSize()) / trueWgSize[0]) + % trueWgSize[1]; + w->workitemid[2][k] = (k + cnt * wfSize()) / + (trueWgSize[0] * trueWgSize[1]); w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] * trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] + @@ -277,9 +292,9 @@ 
w->old_barrier_cnt = wfCtx->old_barrier_cnt; w->barrier_cnt = wfCtx->barrier_cnt; - w->barrier_slots = divCeil(trueWgSizeTotal, VSZ); + w->barrier_slots = divCeil(trueWgSizeTotal, wfSize()); - for (int i = 0; i < VSZ; ++i) { + for (int i = 0; i < wfSize(); ++i) { w->bar_cnt[i] = wfCtx->bar_cnt[i]; } @@ -315,16 +330,17 @@ // is this the last wavefront in the workgroup // if set the spillWidth to be the remaining work-items // so that the vector access is correct - if ((cnt + 1) * VSZ >= trueWgSizeTotal) { - w->spillWidth = trueWgSizeTotal - (cnt * VSZ); + if ((cnt + 1) * wfSize() >= trueWgSizeTotal) { + w->spillWidth = trueWgSizeTotal - (cnt * wfSize()); } else { - w->spillWidth = VSZ; + w->spillWidth = wfSize(); } DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: " "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId); w->start(++_n_wave, ndr->q.code_ptr); + wfCtx->bar_cnt.clear(); } void @@ -339,7 +355,7 @@ // Send L1 cache acquire // isKernel + isAcquire = Kernel Begin if (shader->impl_kern_boundary_sync) { - GPUDynInstPtr gpuDynInst = std::make_shared(nullptr, + GPUDynInstPtr gpuDynInst = std::make_shared(this, nullptr, nullptr, 0); @@ -374,7 +390,7 @@ if (w->status == Wavefront::S_STOPPED) { // if we have scheduled all work items then stop // scheduling wavefronts - if (cnt * VSZ >= trueWgSizeTotal) + if (cnt * wfSize() >= trueWgSizeTotal) break; // reserve vector registers for the scheduled wavefront @@ -420,7 +436,7 @@ // work item of the work group int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount); bool vregAvail = true; - int numWfs = (trueWgSizeTotal + VSZ - 1) / VSZ; + int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize(); int freeWfSlots = 0; // check if the total number of VGPRs required by all WFs of the WG // fit in the VRFs of all SIMD units @@ -623,7 +639,7 @@ // Setup space for call args for (int j = 0; j < numSIMDs; ++j) { for (int i = 0; i < shader->n_wf; ++i) { - 
wfList[j][i]->initCallArgMem(shader->funcargs_size); + wfList[j][i]->initCallArgMem(shader->funcargs_size, wavefrontSize); } } @@ -1193,15 +1209,15 @@ Addr last = 0; switch(computeUnit->prefetchType) { - case Enums::PF_CU: + case Enums::PF_CU: last = computeUnit->lastVaddrCU[mp_index]; break; - case Enums::PF_PHASE: - last = computeUnit->lastVaddrPhase[simdId][mp_index]; + case Enums::PF_PHASE: + last = computeUnit->lastVaddrSimd[simdId][mp_index]; break; - case Enums::PF_WF: + case Enums::PF_WF: last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index]; - default: + default: break; } @@ -1215,7 +1231,7 @@ DPRINTF(GPUPrefetch, "Stride is %d\n", stride); computeUnit->lastVaddrCU[mp_index] = vaddr; - computeUnit->lastVaddrPhase[simdId][mp_index] = vaddr; + computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr; computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr; stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ? @@ -1486,7 +1502,7 @@ ; ldsBankConflictDist - .init(0, VSZ, 2) + .init(0, wfSize(), 2) .name(name() + ".lds_bank_conflicts") .desc("Number of bank conflicts per LDS memory packet") ; @@ -1497,27 +1513,28 @@ ; pageDivergenceDist - // A wavefront can touch 1 to VSZ pages per memory instruction. - // The number of pages per bin can be configured (here it's 4). - .init(1, VSZ, 4) + // A wavefront can touch up to N pages per memory instruction where + // N is equal to the wavefront size + // The number of pages per bin can be configured (here it's 4). + .init(1, wfSize(), 4) .name(name() + ".page_divergence_dist") .desc("pages touched per wf (over all mem. 
instr.)") ; controlFlowDivergenceDist - .init(1, VSZ, 4) + .init(1, wfSize(), 4) .name(name() + ".warp_execution_dist") .desc("number of lanes active per instruction (oval all instructions)") ; activeLanesPerGMemInstrDist - .init(1, VSZ, 4) + .init(1, wfSize(), 4) .name(name() + ".gmem_lanes_execution_dist") .desc("number of active lanes per global memory instruction") ; activeLanesPerLMemInstrDist - .init(1, VSZ, 4) + .init(1, wfSize(), 4) .name(name() + ".lmem_lanes_execution_dist") .desc("number of active lanes per local memory instruction") ; @@ -1529,7 +1546,7 @@ numVecOpsExecuted .name(name() + ".num_vec_ops_executed") - .desc("number of vec ops executed (e.g. VSZ/inst)") + .desc("number of vec ops executed (e.g. WF size/inst)") ; totalCycles diff --git a/src/gpu-compute/dispatcher.hh b/src/gpu-compute/dispatcher.hh --- a/src/gpu-compute/dispatcher.hh +++ b/src/gpu-compute/dispatcher.hh @@ -157,6 +157,7 @@ // helper functions to retrieve/set GPU attributes int getNumCUs(); + int wfSize() const; void setFuncargsSize(int funcargs_size); }; diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc --- a/src/gpu-compute/dispatcher.cc +++ b/src/gpu-compute/dispatcher.cc @@ -387,6 +387,12 @@ return shader->cuList.size(); } +int +GpuDispatcher::wfSize() const +{ + return shader->cuList[0]->wfSize(); +} + void GpuDispatcher::setFuncargsSize(int funcargs_size) { diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc --- a/src/gpu-compute/global_memory_pipeline.cc +++ b/src/gpu-compute/global_memory_pipeline.cc @@ -179,9 +179,9 @@ int physVgpr = w->remap(dst, sizeof(c0), 1); // save the physical VGPR index regVec.push_back(physVgpr); - c1 *p1 = &((c1*)m->d_data)[k * VSZ]; + c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()]; - for (int i = 0; i < VSZ; ++i) { + for (int i = 0; i < w->computeUnit->wfSize(); ++i) { if (m->exec_mask[i]) { DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: " "$%s%d <- %d global ld 
done (src = wavefront " diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh --- a/src/gpu-compute/gpu_dyn_inst.hh +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -205,7 +205,7 @@ public: GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst, uint64_t instSeqNum); - + ~GPUDynInst(); void execute(); int numSrcRegOperands(); int numDstRegOperands(); @@ -226,15 +226,15 @@ Enums::StorageClassType executedAs(); // The address of the memory operation - Addr addr[VSZ]; + std::vector addr; Addr pAddr; // The data to get written - uint8_t d_data[VSZ * 16]; + uint8_t *d_data; // Additional data (for atomics) - uint8_t a_data[VSZ * 8]; + uint8_t *a_data; // Additional data (for atomics) - uint8_t x_data[VSZ * 8]; + uint8_t *x_data; // The execution mask VectorMask exec_mask; diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -42,11 +42,29 @@ GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst, uint64_t instSeqNum) - : GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF), + : GPUExecContext(_cu, _wf), addr(computeUnit()->wfSize(), (Addr)0), + m_op(Enums::MO_UNDEF), memoryOrder(Enums::MEMORY_ORDER_NONE), useContinuation(false), statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum) { - tlbHitLevel.assign(VSZ, -1); + tlbHitLevel.assign(computeUnit()->wfSize(), -1); + d_data = new uint8_t[computeUnit()->wfSize() * 16]; + a_data = new uint8_t[computeUnit()->wfSize() * 8]; + x_data = new uint8_t[computeUnit()->wfSize() * 8]; + for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) { + a_data[i] = 0; + x_data[i] = 0; + } + for (int i = 0; i < (computeUnit()->wfSize() * 16); ++i) { + d_data[i] = 0; + } +} + +GPUDynInst::~GPUDynInst() +{ + delete[] d_data; + delete[] a_data; + delete[] x_data; } void diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc --- 
a/src/gpu-compute/local_memory_pipeline.cc +++ b/src/gpu-compute/local_memory_pipeline.cc @@ -148,9 +148,9 @@ int physVgpr = w->remap(dst,sizeof(c0),1); // save the physical VGPR index regVec.push_back(physVgpr); - c1 *p1 = &((c1*)m->d_data)[k * VSZ]; + c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()]; - for (int i = 0; i < VSZ; ++i) { + for (int i = 0; i < w->computeUnit->wfSize(); ++i) { if (m->exec_mask[i]) { // write the value into the physical VGPR. This is a purely // functional operation. No timing is modeled. diff --git a/src/gpu-compute/misc.hh b/src/gpu-compute/misc.hh --- a/src/gpu-compute/misc.hh +++ b/src/gpu-compute/misc.hh @@ -38,27 +38,13 @@ #include #include +#include #include "base/misc.hh" class GPUDynInst; -// wavefront size of the machine -static const int VSZ = 64; - -/* - This check is necessary because std::bitset only provides conversion to - unsigned long or unsigned long long via to_ulong() or to_ullong(). there are - a few places in the code where to_ullong() is used, however if VSZ is larger - than a value the host can support then bitset will throw a runtime exception. - - we should remove all use of to_long() or to_ullong() so we can have VSZ - greater than 64b, however until that is done this assert is required. 
- */ -static_assert(VSZ <= sizeof(unsigned long long) * 8, - "VSZ is larger than the host can support"); - -typedef std::bitset VectorMask; +typedef std::bitset::digits> VectorMask; typedef std::shared_ptr GPUDynInstPtr; class WaitClass diff --git a/src/gpu-compute/qstruct.hh b/src/gpu-compute/qstruct.hh --- a/src/gpu-compute/qstruct.hh +++ b/src/gpu-compute/qstruct.hh @@ -100,7 +100,7 @@ { // 32 bit values // barrier state - int bar_cnt[VSZ]; + std::vector bar_cnt; // id (which WF in the WG) int cnt; diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc --- a/src/gpu-compute/vector_register_file.cc +++ b/src/gpu-compute/vector_register_file.cc @@ -63,7 +63,7 @@ nxtBusy.clear(); nxtBusy.resize(numRegsPerSimd, 0); - vgprState->init(numRegsPerSimd); + vgprState->init(numRegsPerSimd, p->wfSize); } void diff --git a/src/gpu-compute/vector_register_state.hh b/src/gpu-compute/vector_register_state.hh --- a/src/gpu-compute/vector_register_state.hh +++ b/src/gpu-compute/vector_register_state.hh @@ -51,7 +51,7 @@ { public: VecRegisterState(); - void init(uint32_t _size); + void init(uint32_t _size, uint32_t wf_size); const std::string& name() const { return _name; } void setParent(ComputeUnit *_computeUnit); @@ -93,9 +93,9 @@ ComputeUnit *computeUnit; std::string _name; // 32-bit Single Precision Vector Register State - std::vector> s_reg; + std::vector> s_reg; // 64-bit Double Precision Vector Register State - std::vector> d_reg; + std::vector> d_reg; }; #endif // __VECTOR_REGISTER_STATE_HH__ diff --git a/src/gpu-compute/vector_register_state.cc b/src/gpu-compute/vector_register_state.cc --- a/src/gpu-compute/vector_register_state.cc +++ b/src/gpu-compute/vector_register_state.cc @@ -35,6 +35,8 @@ #include "gpu-compute/vector_register_state.hh" +#include + #include "gpu-compute/compute_unit.hh" VecRegisterState::VecRegisterState() : computeUnit(nullptr) @@ -51,8 +53,19 @@ } void -VecRegisterState::init(uint32_t _size) 
+VecRegisterState::init(uint32_t _size, uint32_t wf_size) { s_reg.resize(_size); + fatal_if(wf_size > std::numeric_limits::digits || + wf_size <= 0, + "WF size is larger than the host can support or is zero"); + fatal_if((wf_size & (wf_size - 1)) != 0, + "Wavefront size should be a power of 2"); + for (int i = 0; i < s_reg.size(); ++i) { + s_reg[i].resize(wf_size, 0); + } d_reg.resize(_size); + for (int i = 0; i < d_reg.size(); ++i) { + d_reg[i].resize(wf_size, 0); + } }