diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -301,6 +301,37 @@
     LdsState &lds;
 
   public:
+    // instruction-mix stats: the Scalars are incremented by
+    // updateInstStats() and the Formulas are defined in regStats()
+    Stats::Scalar VALUInsts;
+    Stats::Formula VALUInstsPerSIMD;
+    Stats::Scalar SALUInsts;
+    Stats::Formula SALUInstsPerSIMD;
+    Stats::Scalar instCyclesSALU;
+    Stats::Scalar instCyclesVALU;
+    Stats::Scalar threadCyclesVALU;
+    Stats::Formula VALUUtilization;
+    Stats::Scalar LDSInsts;
+    Stats::Scalar LDSInstsNoFlat;
+    Stats::Formula LDSInstsPerSIMD;
+    Stats::Formula LDSInstsNoFlatPerSIMD;
+    Stats::Scalar flatInsts;
+    Stats::Formula flatInstsPerSIMD;
+    Stats::Scalar vectorWrites;
+    Stats::Formula vectorWritesPerSIMD;
+    Stats::Scalar vectorReads;
+    Stats::Formula vectorReadsPerSIMD;
+    Stats::Scalar scalarWrites;
+    Stats::Formula scalarWritesPerSIMD;
+    Stats::Scalar scalarReads;
+    Stats::Formula scalarReadsPerSIMD;
+
+    /**
+     * Update the instruction-mix counters above for one dynamic
+     * instruction, based on its type (scalar/vector, ALU, memory).
+     */
+    void updateInstStats(GPUDynInstPtr gpuDynInst);
+
     // the following stats compute the avg. TLB accesslatency per
     // uncoalesced request (only for data)
     Stats::Scalar tlbRequests;
# Node ID 4ce20a015e53479f669f319ed82638e129fd6189
# Parent 2ef3fe5c3f4423d7253eb65bd27605a3f9885968
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -1408,6 +1408,118 @@
 {
     MemObject::regStats();
 
+    VALUInsts
+        .name(name() + ".valu_insts")
+        .desc("Number of vector ALU insts issued.")
+    ;
+    VALUInstsPerSIMD
+        .name(name() + ".valu_insts_per_simd")
+        .desc("Number of vector ALU insts issued (per-simd).")
+    ;
+    SALUInsts
+        .name(name() + ".salu_insts")
+        .desc("Number of scalar ALU insts issued.")
+    ;
+    SALUInstsPerSIMD
+        .name(name() + ".salu_insts_per_simd")
+        .desc("Number of scalar ALU insts issued (per-simd).")
+    ;
+    instCyclesSALU
+        .name(name() + ".inst_cycles_SALU")
+        .desc("Number of cycles needed to execute SALU insts.")
+    ;
+    instCyclesVALU
+        .name(name() + ".inst_cycles_VALU")
+        .desc("Number of cycles needed to execute VALU insts.")
+    ;
+    threadCyclesVALU
+        .name(name() + ".thread_cycles_VALU")
+        .desc("Number of thread cycles used to execute vector ALU ops. "
+              "Similar to instCyclesVALU but multiplied by the number of "
+              "active threads.")
+    ;
+    VALUUtilization
+        .name(name() + ".VALU_utilization")
+        .desc("Percentage of active vector ALU threads in a wave.")
+    ;
+    LDSInsts
+        .name(name() + ".lds_insts")
+        .desc("Number of LDS insts issued, including FLAT accesses "
+              "that resolve to LDS.")
+    ;
+    LDSInstsNoFlat
+        .name(name() + ".lds_no_flat_insts")
+        .desc("Number of LDS insts issued, not including FLAT "
+              "accesses that resolve to LDS.")
+    ;
+    LDSInstsPerSIMD
+        .name(name() + ".lds_insts_per_simd")
+        .desc("The average number of LDS insts per SIMD.")
+    ;
+    LDSInstsNoFlatPerSIMD
+        .name(name() + ".lds_no_flat_insts_per_simd")
+        .desc("The average number of LDS insts (not including FLAT "
+              "accesses that resolve to LDS) per SIMD.")
+    ;
+    flatInsts
+        .name(name() + ".flat_insts")
+        .desc("The number of FLAT insts issued.")
+    ;
+    flatInstsPerSIMD
+        .name(name() + ".flat_insts_per_simd")
+        .desc("The average number of FLAT insts per SIMD.")
+    ;
+    vectorWrites
+        .name(name() + ".vector_writes")
+        .desc("Number of vector write insts (excluding FLAT insts).")
+    ;
+    vectorWritesPerSIMD
+        .name(name() + ".vector_writes_per_simd")
+        .desc("The average number of vector write insts (excluding FLAT insts) "
+              "per SIMD.")
+    ;
+    vectorReads
+        .name(name() + ".vector_reads")
+        .desc("Number of vector read insts (excluding FLAT insts).")
+    ;
+    vectorReadsPerSIMD
+        .name(name() + ".vector_reads_per_simd")
+        .desc("The average number of vector read insts (excluding FLAT insts) "
+              "per SIMD.")
+    ;
+    scalarWrites
+        .name(name() + ".scalar_writes")
+        .desc("Number of scalar write insts.")
+    ;
+    scalarWritesPerSIMD
+        .name(name() + ".scalar_writes_per_simd")
+        .desc("The average number of scalar write insts per SIMD.")
+    ;
+    scalarReads
+        .name(name() + ".scalar_reads")
+        .desc("Number of scalar read insts.")
+    ;
+    scalarReadsPerSIMD
+        .name(name() + ".scalar_reads_per_simd")
+        .desc("The average number of scalar read insts per SIMD.")
+    ;
+
+    // NOTE(review): the *PerSIMD formulas are normalized by completedWfs;
+    // confirm that this is the intended denominator.
+    VALUInstsPerSIMD = VALUInsts / completedWfs;
+    SALUInstsPerSIMD = SALUInsts / completedWfs;
+    // utilization is relative to the same lane count (wavefrontSize) that
+    // updateInstStats() iterates over when counting threadCyclesVALU
+    VALUUtilization =
+        (threadCyclesVALU / (wavefrontSize * instCyclesVALU)) * 100;
+    LDSInstsPerSIMD = LDSInsts / completedWfs;
+    LDSInstsNoFlatPerSIMD = LDSInstsNoFlat / completedWfs;
+    flatInstsPerSIMD = flatInsts / completedWfs;
+    vectorWritesPerSIMD = vectorWrites / completedWfs;
+    vectorReadsPerSIMD = vectorReads / completedWfs;
+    scalarWritesPerSIMD = scalarWrites / completedWfs;
+    scalarReadsPerSIMD = scalarReads / completedWfs;
+
     tlbCycles
         .name(name() + ".tlb_cycles")
         .desc("total number of cycles for all uncoalesced requests")
@@ -1567,6 +1679,52 @@
 }
 
 void
+ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
+{
+    if (gpuDynInst->isScalar()) {
+        // waitcnt insts are skipped even when they are flagged as ALU ops
+        if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
+            SALUInsts++;
+            instCyclesSALU++;
+        } else if (gpuDynInst->isLoad()) {
+            scalarReads++;
+        } else if (gpuDynInst->isStore()) {
+            scalarWrites++;
+        }
+    } else {
+        if (gpuDynInst->isALU()) {
+            VALUInsts++;
+            instCyclesVALU++;
+            // one thread cycle per lane enabled in the execution mask
+            for (int lane = 0; lane < wavefrontSize; ++lane) {
+                if (gpuDynInst->wavefront()->execMask(lane)) {
+                    threadCyclesVALU++;
+                }
+            }
+        } else if (gpuDynInst->isFlat()) {
+            flatInsts++;
+            // FLAT accesses that resolve to LDS count as LDS insts too
+            if (gpuDynInst->isLocalMem()) {
+                LDSInsts++;
+            }
+        } else if (gpuDynInst->isLocalMem()) {
+            LDSInsts++;
+            LDSInstsNoFlat++;
+        }
+
+        // vector_reads/vector_writes are documented as excluding FLAT
+        // insts, so FLAT loads/stores must not be counted here
+        if (!gpuDynInst->isFlat()) {
+            if (gpuDynInst->isLoad()) {
+                vectorReads++;
+            } else if (gpuDynInst->isStore()) {
+                vectorWrites++;
+            }
+        }
+    }
+}
+
+void
 ComputeUnit::updatePageDivergenceDist(Addr addr)
 {
     Addr virt_page_addr = roundDown(addr, TheISA::PageBytes);
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -656,7 +656,11 @@
         DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
                 "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                 ii->disassemble(), old_pc);
+
         ii->execute(ii);
+
+        // update the instruction stats in the CU
+        computeUnit->updateInstStats(ii);
         // access the VRF
         computeUnit->vrf[simdId]->exec(ii, this);
         srcRegOpDist.sample(ii->numSrcRegOperands());