20240804/ modify ptx_exec_inst
void ptx_thread_info::ptx_exec_inst(warp_inst_t &inst, unsigned lane_id) {
  bool skip = false;
  int op_classification = 0;
  addr_t pc = next_instr();
  assert(pc == inst.pc);  // make sure timing model and functional model are
                          // in sync
  const ptx_instruction *pI = m_func_info->get_instruction(pc);
  set_npc(pc + pI->inst_size());

  try {
    clearRPC();
    m_last_set_operand_value.u64 = 0;

    if (is_done()) {
      printf(
          "attempted to execute instruction on a thread that is already "
          "done.\n");
      assert(0);
    }

    if (g_debug_execution >= 6 ||
        m_gpu->get_config().get_ptx_inst_debug_to_file()) {
      if ((m_gpu->gpgpu_ctx->func_sim->g_debug_thread_uid == 0) ||
          (get_uid() ==
           (unsigned)(m_gpu->gpgpu_ctx->func_sim->g_debug_thread_uid))) {
        clear_modifiedregs();
        enable_debug_trace();
      }
    }

    if (pI->has_pred()) {
      const operand_info &pred = pI->get_pred();
      ptx_reg_t pred_value = get_operand_value(pred, pred, PRED_TYPE, this, 0);
      if (pI->get_pred_mod() == -1) {
        skip = (pred_value.pred & 0x0001) ^
               pI->get_pred_neg();  // ptxplus inverts the zero flag
      } else {
        skip = !pred_lookup(pI->get_pred_mod(), pred_value.pred & 0x000F);
      }
    }

    int inst_opcode = pI->get_opcode();

    if (skip) {
      inst.set_not_active(lane_id);
    } else {
      const ptx_instruction *pI_saved = pI;
      ptx_instruction *pJ = NULL;
      if (pI->get_opcode() == VOTE_OP || pI->get_opcode() == ACTIVEMASK_OP) {
        pJ = new ptx_instruction(*pI);
        *((warp_inst_t *)pJ) = inst;  // copy active mask information
        pI = pJ;
      }

      // Debug check for MMA load operations (wmma::load_matrix_sync): print
      // the address and memory space being accessed.
      // NOTE: this block runs *before* the opcode switch below executes the
      // instruction, so last_eaddr()/last_space() may still hold values from
      // the previous memory operation rather than this load.
      if (pI->get_opcode() == MMA_LD_OP) {
        printf("PTX Instruction: %s (wmma::load_matrix_sync) at %s:%u\n",
               pI->get_opcode_cstr(), pI->source_file(), pI->source_line());
        addr_t addr = last_eaddr();
        memory_space_t space = last_space();

        // Helper to convert memory_space_t to a printable string
        auto space_to_string = [](const memory_space_t &space) {
          switch (space.get_type()) {
            case undefined_space: return "undefined";
            case reg_space: return "register";
            case local_space: return "local";
            case shared_space: return "shared";
            case sstarr_space: return "sstarr";
            case param_space_unclassified: return "param (unclassified)";
            case param_space_kernel: return "param (kernel)";
            case param_space_local: return "param (local)";
            case const_space: return "constant";
            case tex_space: return "texture";
            case surf_space: return "surface";
            case global_space: return "global";
            case generic_space: return "generic";
            case instruction_space: return "instruction";
            default: return "unknown";
          }
        };

        printf(" MMA Load: Address 0x%llx\n", (unsigned long long)addr);
        printf(" Memory Space: %s\n", space_to_string(space));
        printf(" Memory Bank: %u\n", space.get_bank());

        // Additional checks based on memory space (kept for reference):
        // if (space.is_const()) {
        //   printf(" This is a constant memory access\n");
        // } else if (space.is_local()) {
        //   printf(" This is a local memory access\n");
        // } else if (space.is_global()) {
        //   printf(" This is a global memory access\n");
        // }
        // Use last_eaddr() to get the memory address:
        // addr_t addr = last_eaddr();
        // printf(" Loading from address 0x%llx\n", (unsigned long long)addr);
        // To print the size of the data being loaded:
        // unsigned to_type = pI->get_type();
        // unsigned data_size = datatype2size(to_type);
        // printf(" Data size: %u bytes\n", data_size);
      }

      if (inst_opcode == MMA_OP || inst_opcode == MMA_LD_OP ||
          inst_opcode == MMA_ST_OP) {
        if (inst.active_count() != MAX_WARP_SIZE) {
          printf(
              "Tensor Core operations are warp-synchronous: all threads in "
              "the warp need to be active.\n");
          assert(0);
        }
      }

      // Tensor Core operations are warp-synchronous, so each instruction
      // needs to be executed only once per warp. To speed up simulation, the
      // redundant per-lane executions are skipped (only lane 0 executes).
      if (!tensorcore_op(inst_opcode) ||
          (tensorcore_op(inst_opcode) && (lane_id == 0))) {
        switch (inst_opcode) {
#define OP_DEF(OP, FUNC, STR, DST, CLASSIFICATION) \
  case OP:                                         \
    FUNC(pI, this);                                \
    op_classification = CLASSIFICATION;            \
    break;
#define OP_W_DEF(OP, FUNC, STR, DST, CLASSIFICATION) \
  case OP:                                           \
    FUNC(pI, get_core(), inst);                      \
    op_classification = CLASSIFICATION;              \
    break;
#include "opcodes.def"
#undef OP_DEF
#undef OP_W_DEF
          default:
            printf("Execution error: Invalid opcode (0x%x)\n",
                   pI->get_opcode());
            break;
        }
      }
      delete pJ;
      pI = pI_saved;

      // Run exit instruction if exit option included
      if (pI->is_exit()) exit_impl(pI, this);
    }

    const gpgpu_functional_sim_config &config = m_gpu->get_config();

    // Output instruction information to file and stdout
    if (config.get_ptx_inst_debug_to_file() != 0 &&
        (config.get_ptx_inst_debug_thread_uid() == 0 ||
         config.get_ptx_inst_debug_thread_uid() == get_uid())) {
      fprintf(m_gpu->get_ptx_inst_debug_file(), "[thd=%u] : (%s:%u - %s)\n",
              get_uid(), pI->source_file(), pI->source_line(),
              pI->get_source());
      // fprintf(ptx_inst_debug_file, "has memory read=%d, has memory
      // write=%d\n", pI->has_memory_read(), pI->has_memory_write());
      fflush(m_gpu->get_ptx_inst_debug_file());
    }

    if (m_gpu->gpgpu_ctx->func_sim->ptx_debug_exec_dump_cond<5>(get_uid(),
                                                                pc)) {
      dim3 ctaid = get_ctaid();
      dim3 tid = get_tid();
      printf(
          "%u [thd=%u][i=%u] : ctaid=(%u,%u,%u) tid=(%u,%u,%u) icount=%u "
          "[pc=%u] (%s:%u - %s) [0x%llx]\n",
          m_gpu->gpgpu_ctx->func_sim->g_ptx_sim_num_insn, get_uid(), pI->uid(),
          ctaid.x, ctaid.y, ctaid.z, tid.x, tid.y, tid.z, get_icount(), pc,
          pI->source_file(), pI->source_line(), pI->get_source(),
          m_last_set_operand_value.u64);
      fflush(stdout);
    }

    addr_t insn_memaddr = 0xFEEBDAED;
    memory_space_t insn_space = undefined_space;
    _memory_op_t insn_memory_op = no_memory_op;
    unsigned insn_data_size = 0;
    if (pI->has_memory_read() || pI->has_memory_write()) {
      if (!(inst_opcode == MMA_LD_OP || inst_opcode == MMA_ST_OP)) {
        insn_memaddr = last_eaddr();
        insn_space = last_space();
        unsigned to_type = pI->get_type();
        insn_data_size = datatype2size(to_type);
        insn_memory_op = pI->has_memory_read() ? memory_load : memory_store;

        // Helper to convert memory_space_t to a printable string
        // (duplicated from the MMA_LD_OP block above)
        auto space_to_string = [](const memory_space_t &space) {
          switch (space.get_type()) {
            case undefined_space: return "undefined";
            case reg_space: return "register";
            case local_space: return "local";
            case shared_space: return "shared";
            case sstarr_space: return "sstarr";
            case param_space_unclassified: return "param (unclassified)";
            case param_space_kernel: return "param (kernel)";
            case param_space_local: return "param (local)";
            case const_space: return "constant";
            case tex_space: return "texture";
            case surf_space: return "surface";
            case global_space: return "global";
            case generic_space: return "generic";
            case instruction_space: return "instruction";
            default: return "unknown";
          }
        };

        // Debug print: report every memory read with its address, size, and
        // memory space
        if (pI->has_memory_read()) {
          printf(
              "PTX Instruction: %s (at %s:%u) is reading memory. Address: "
              "0x%llx, Size: %u bytes, Space: %s\n",
              pI->get_opcode_cstr(), pI->source_file(), pI->source_line(),
              (unsigned long long)insn_memaddr, insn_data_size,
              space_to_string(insn_space));
        }
      }
    }

    if (pI->get_opcode() == BAR_OP && pI->barrier_op() == RED_OPTION) {
      inst.add_callback(lane_id, last_callback().function,
                        last_callback().instruction, this,
                        false /*not atomic*/);
    }

    if (pI->get_opcode() == ATOM_OP) {
      insn_memaddr = last_eaddr();
      insn_space = last_space();
      inst.add_callback(lane_id, last_callback().function,
                        last_callback().instruction, this, true /*atomic*/);
      unsigned to_type = pI->get_type();
      insn_data_size = datatype2size(to_type);
    }

    if (pI->get_opcode() == TEX_OP) {
      inst.set_addr(lane_id, last_eaddr());
      assert(inst.space == last_space());
      insn_data_size = get_tex_datasize(
          pI,
          this);  // texture obtains its data granularity from the texture info
    }

    // Output register information to file and stdout
    if (config.get_ptx_inst_debug_to_file() != 0 &&
        (config.get_ptx_inst_debug_thread_uid() == 0 ||
         config.get_ptx_inst_debug_thread_uid() == get_uid())) {
      dump_modifiedregs(m_gpu->get_ptx_inst_debug_file());
      dump_regs(m_gpu->get_ptx_inst_debug_file());
    }

    if (g_debug_execution >= 6) {
      if (m_gpu->gpgpu_ctx->func_sim->ptx_debug_exec_dump_cond<6>(get_uid(),
                                                                  pc))
        dump_modifiedregs(stdout);
    }
    if (g_debug_execution >= 10) {
      if (m_gpu->gpgpu_ctx->func_sim->ptx_debug_exec_dump_cond<10>(get_uid(),
                                                                   pc))
        dump_regs(stdout);
    }

    update_pc();
    m_gpu->gpgpu_ctx->func_sim->g_ptx_sim_num_insn++;

    // not used in functional simulation mode
    if (!(this->m_functionalSimulationMode))
      ptx_file_line_stats_add_exec_count(pI);

    if (m_gpu->gpgpu_ctx->func_sim->gpgpu_ptx_instruction_classification) {
      m_gpu->gpgpu_ctx->func_sim->init_inst_classification_stat();
      unsigned space_type = 0;
      switch (pI->get_space().get_type()) {
        case global_space: space_type = 10; break;
        case local_space: space_type = 11; break;
        case tex_space: space_type = 12; break;
        case surf_space: space_type = 13; break;
        case param_space_kernel:
        case param_space_local: space_type = 14; break;
        case shared_space: space_type = 15; break;
        case const_space: space_type = 16; break;
        default: space_type = 0; break;
      }
      StatAddSample(m_gpu->gpgpu_ctx->func_sim->g_inst_classification_stat
                        [m_gpu->gpgpu_ctx->func_sim->g_ptx_kernel_count],
                    op_classification);
      if (space_type)
        StatAddSample(m_gpu->gpgpu_ctx->func_sim->g_inst_classification_stat
                          [m_gpu->gpgpu_ctx->func_sim->g_ptx_kernel_count],
                      (int)space_type);
      StatAddSample(m_gpu->gpgpu_ctx->func_sim->g_inst_op_classification_stat
                        [m_gpu->gpgpu_ctx->func_sim->g_ptx_kernel_count],
                    (int)pI->get_opcode());
    }

    if ((m_gpu->gpgpu_ctx->func_sim->g_ptx_sim_num_insn % 100000) == 0) {
      dim3 ctaid = get_ctaid();
      dim3 tid = get_tid();
      DPRINTF(LIVENESS,
              "GPGPU-Sim PTX: %u instructions simulated : ctaid=(%u,%u,%u) "
              "tid=(%u,%u,%u)\n",
              m_gpu->gpgpu_ctx->func_sim->g_ptx_sim_num_insn, ctaid.x, ctaid.y,
              ctaid.z, tid.x, tid.y, tid.z);
      fflush(stdout);
    }

    // "Return values"
    if (!skip) {
      if (!(inst_opcode == MMA_LD_OP || inst_opcode == MMA_ST_OP)) {
        inst.space = insn_space;
        inst.set_addr(lane_id, insn_memaddr);
        inst.data_size = insn_data_size;  // simpleAtomicIntrinsics
        assert(inst.memory_op == insn_memory_op);
      }
    }
  } catch (int x) {
    printf("GPGPU-Sim PTX: ERROR (%d) executing instruction (%s:%u)\n", x,
           pI->source_file(), pI->source_line());
    printf("GPGPU-Sim PTX: '%s'\n", pI->get_source());
    abort();
  }
}