# Node ID 5695d76ec68ba1c4ec49fafda9c859552ae53bba # Parent 707f0168d0fb0cfbfa0a31ff54132f1693f81197 diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc --- a/src/gpu-compute/fetch_unit.cc +++ b/src/gpu-compute/fetch_unit.cc @@ -114,7 +114,18 @@ FetchUnit::initiateFetch(Wavefront *wavefront) { // calculate the virtual address to fetch from the SQC - Addr vaddr = wavefront->pc() + wavefront->instructionBuffer.size(); + Addr vaddr = wavefront->pc(); + + /** + * the instruction buffer holds one instruction per entry, regardless + * of the underlying instruction's size. the PC, however, addresses + * instructions on a 32b granularity so we must account for that here. + */ + for (int i = 0; i < wavefront->instructionBuffer.size(); ++i) { + int current_inst_size = + wavefront->instructionBuffer.at(i)->staticInstruction()->instSize(); + vaddr += current_inst_size / sizeof(uint32_t); + } vaddr = wavefront->basePtr + vaddr * sizeof(GPUStaticInst*); DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n", @@ -267,6 +278,18 @@ GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]); assert(inst_ptr); + + if (inst_ptr->instSize() == 8) { + /** + * this instruction occupies 2 consecutive + * entries in the instruction array, the + * second of which contains a nullptr. 
so if + * this inst is 8 bytes we advance two entries + * instead of 1 + */ + ++i; + } + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n", computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, inst_ptr->disassemble()); diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -670,7 +670,7 @@ computeUnit->lastExecCycle[simdId]); computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value(); if (pc() == old_pc) { - uint32_t new_pc = old_pc + 1; + uint32_t new_pc = _gpuISA.advancePC(old_pc, ii); // PC not modified by instruction, proceed to next or pop frame pc(new_pc); if (new_pc == rpc()) {