20240804/ modify ptx_exec_inst
void ptx_thread_info::ptx_exec_inst(warp_inst_t &inst, unsigned lane_id) {
  bool skip = false;
  int op_classification = 0;
  // Helper to render a memory_space_t as a human-readable string, defined
  // once here so the debug prints below can share it.
  auto space_to_string = [](const memory_space_t &space) {
    switch (space.get_type()) {
      case undefined_space:
        return "undefined";
      case reg_space:
        return "register";
      case local_space:
        return "local";
      case shared_space:
        return "shared";
      case sstarr_space:
        return "sstarr";
      case param_space_unclassified:
        return "param (unclassified)";
      case param_space_kernel:
        return "param (kernel)";
      case param_space_local:
        return "param (local)";
      case const_space:
        return "constant";
      case tex_space:
        return "texture";
      case surf_space:
        return "surface";
      case global_space:
        return "global";
      case generic_space:
        return "generic";
      case instruction_space:
        return "instruction";
      default:
        return "unknown";
    }
  };
addr_t pc = next_instr();
assert(pc ==
inst.pc); // make sure timing model and functional model are in sync
const ptx_instruction *pI = m_func_info->get_instruction(pc);
set_npc(pc + pI->inst_size());
try {
clearRPC();
m_last_set_operand_value.u64 = 0;
if (is_done()) {
printf(
"attempted to execute instruction on a thread that is already "
"done.\n");
assert(0);
}
if (g_debug_execution >= 6 ||
m_gpu->get_config().get_ptx_inst_debug_to_file()) {
if ((m_gpu->gpgpu_ctx->func_sim->g_debug_thread_uid == 0) ||
(get_uid() ==
(unsigned)(m_gpu->gpgpu_ctx->func_sim->g_debug_thread_uid))) {
clear_modifiedregs();
enable_debug_trace();
}
}
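    // Evaluate the guard predicate, if any, to decide whether this lane
    // executes the instruction.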
if (pI->has_pred()) {
const operand_info &pred = pI->get_pred();
ptx_reg_t pred_value = get_operand_value(pred, pred, PRED_TYPE, this, 0);
if (pI->get_pred_mod() == -1) {
skip = (pred_value.pred & 0x0001) ^
pI->get_pred_neg(); // ptxplus inverts the zero flag
} else {
skip = !pred_lookup(pI->get_pred_mod(), pred_value.pred & 0x000F);
}
}
int inst_opcode = pI->get_opcode();
if (skip) {
inst.set_not_active(lane_id);
} else {
const ptx_instruction *pI_saved = pI;
ptx_instruction *pJ = NULL;
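      // vote/activemask need the warp's active mask, which lives in the
      // warp_inst_t, so execute a temporary copy of the instruction that
      // carries that information.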
if (pI->get_opcode() == VOTE_OP || pI->get_opcode() == ACTIVEMASK_OP) {
pJ = new ptx_instruction(*pI);
*((warp_inst_t *)pJ) = inst; // copy active mask information
pI = pJ;
}
      // Debug check for MMA load operations: report the address and memory
      // space read by wmma::load_matrix_sync.
if (pI->get_opcode() == MMA_LD_OP) {
printf("PTX Instruction: %s (wmma::load_matrix_sync) at %s:%u\n",
pI->get_opcode_cstr(), pI->source_file(), pI->source_line());
addr_t addr = last_eaddr();
memory_space_t space = last_space();
printf(" MMA Load: Address 0x%llx\n", (unsigned long long)addr);
printf(" Memory Space: %s\n", space_to_string(space));
printf(" Memory Bank: %u\n", space.get_bank());
        // Additional checks based on memory space:
        // if (space.is_const()) {
        //   printf("  This is a constant memory access\n");
        // } else if (space.is_local()) {
        //   printf("  This is a local memory access\n");
        // } else if (space.is_global()) {
        //   printf("  This is a global memory access\n");
        // }
        // Use last_eaddr() to get the memory address:
        // addr_t addr = last_eaddr();
        // printf("  Loading from address 0x%llx\n", (unsigned long long)addr);
        // To print the size of the data being loaded:
        // unsigned to_type = pI->get_type();
        // unsigned data_size = datatype2size(to_type);
        // printf("  Data size: %u bytes\n", data_size);
}
      if (inst_opcode == MMA_OP || inst_opcode == MMA_LD_OP ||
          inst_opcode == MMA_ST_OP) {
        if (inst.active_count() != MAX_WARP_SIZE) {
          printf(
              "Tensor Core operations are warp synchronous; all threads in "
              "the warp need to be active.\n");
          assert(0);
        }
      }
      // Tensor Core operations are warp synchronous, so each instruction
      // needs to be executed only once per warp. To make the simulation
      // faster, the redundant tensor core work is skipped on every lane
      // except lane 0.
if (!tensorcore_op(inst_opcode) ||
((tensorcore_op(inst_opcode)) && (lane_id == 0))) {
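        // Dispatch on opcode: opcodes.def lists every opcode once, and the
        // OP_DEF/OP_W_DEF X-macros below expand each entry into a case that
        // calls that opcode's implementation function.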
switch (inst_opcode) {
#define OP_DEF(OP, FUNC, STR, DST, CLASSIFICATION) \
case OP: \
FUNC(pI, this); \
op_classification = CLASSIFICATION; \
break;
#define OP_W_DEF(OP, FUNC, STR, DST, CLASSIFICATION) \
case OP: \
FUNC(pI, get_core(), inst); \
op_classification = CLASSIFICATION; \
break;
#include "opcodes.def"
#undef OP_DEF
#undef OP_W_DEF
default:
printf("Execution error: Invalid opcode (0x%x)\n",
pI->get_opcode());
break;
}
}
delete pJ;
pI = pI_saved;
// Run exit instruction if exit option included
if (pI->is_exit()) exit_impl(pI, this);
}
const gpgpu_functional_sim_config &config = m_gpu->get_config();
// Output instruction information to file and stdout
if (config.get_ptx_inst_debug_to_file() != 0 &&
(config.get_ptx_inst_debug_thread_uid() == 0 ||
config.get_ptx_inst_debug_thread_uid() == get_uid())) {
fprintf(m_gpu->get_ptx_inst_debug_file(), "[thd=%u] : (%s:%u - %s)\n",
get_uid(), pI->source_file(), pI->source_line(),
pI->get_source());
// fprintf(ptx_inst_debug_file, "has memory read=%d, has memory
// write=%d\n", pI->has_memory_read(), pI->has_memory_write());
fflush(m_gpu->get_ptx_inst_debug_file());
}
if (m_gpu->gpgpu_ctx->func_sim->ptx_debug_exec_dump_cond<5>(get_uid(),
pc)) {
dim3 ctaid = get_ctaid();
dim3 tid = get_tid();
printf(
"%u [thd=%u][i=%u] : ctaid=(%u,%u,%u) tid=(%u,%u,%u) icount=%u "
"[pc=%u] (%s:%u - %s) [0x%llx]\n",
m_gpu->gpgpu_ctx->func_sim->g_ptx_sim_num_insn, get_uid(), pI->uid(),
ctaid.x, ctaid.y, ctaid.z, tid.x, tid.y, tid.z, get_icount(), pc,
pI->source_file(), pI->source_line(), pI->get_source(),
m_last_set_operand_value.u64);
fflush(stdout);
}
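    // Defaults for instructions that make no memory access; 0xFEEBDAED is a
    // recognizable placeholder rather than a real address.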
addr_t insn_memaddr = 0xFEEBDAED;
memory_space_t insn_space = undefined_space;
_memory_op_t insn_memory_op = no_memory_op;
unsigned insn_data_size = 0;
    if (pI->has_memory_read() || pI->has_memory_write()) {
      if (!(inst_opcode == MMA_LD_OP || inst_opcode == MMA_ST_OP)) {
insn_memaddr = last_eaddr();
insn_space = last_space();
unsigned to_type = pI->get_type();
insn_data_size = datatype2size(to_type);
insn_memory_op = pI->has_memory_read() ? memory_load : memory_store;
        // Debug print: report memory reads for this instruction.
if (pI->has_memory_read()) {
printf(
"PTX Instruction: %s (at %s:%u) is reading memory. Address: "
"0x%llx, Size: %u bytes, Space: %s\n",
pI->get_opcode_cstr(), pI->source_file(), pI->source_line(),
(unsigned long long)insn_memaddr, insn_data_size,
space_to_string(insn_space));
}
}
}
if (pI->get_opcode() == BAR_OP && pI->barrier_op() == RED_OPTION) {
inst.add_callback(lane_id, last_callback().function,
last_callback().instruction, this,
false /*not atomic*/);
}
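    // For atomics, register a callback so the read-modify-write is performed
    // later, when the memory system processes the access.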
if (pI->get_opcode() == ATOM_OP) {
insn_memaddr = last_eaddr();
insn_space = last_space();
inst.add_callback(lane_id, last_callback().function,
last_callback().instruction, this, true /*atomic*/);
unsigned to_type = pI->get_type();
insn_data_size = datatype2size(to_type);
}
if (pI->get_opcode() == TEX_OP) {
inst.set_addr(lane_id, last_eaddr());
assert(inst.space == last_space());
      insn_data_size = get_tex_datasize(
          pI,
          this);  // textures obtain their data granularity from the texture info
}
// Output register information to file and stdout
if (config.get_ptx_inst_debug_to_file() != 0 &&
(config.get_ptx_inst_debug_thread_uid() == 0 ||
config.get_ptx_inst_debug_thread_uid() == get_uid())) {
dump_modifiedregs(m_gpu->get_ptx_inst_debug_file());
dump_regs(m_gpu->get_ptx_inst_debug_file());
}
if (g_debug_execution >= 6) {
if (m_gpu->gpgpu_ctx->func_sim->ptx_debug_exec_dump_cond<6>(get_uid(),
pc))
dump_modifiedregs(stdout);
}
if (g_debug_execution >= 10) {
if (m_gpu->gpgpu_ctx->func_sim->ptx_debug_exec_dump_cond<10>(get_uid(),
pc))
dump_regs(stdout);
}
update_pc();
m_gpu->gpgpu_ctx->func_sim->g_ptx_sim_num_insn++;
    // Per-line execution statistics are not collected in pure functional
    // simulation mode.
if (!(this->m_functionalSimulationMode))
ptx_file_line_stats_add_exec_count(pI);
if (m_gpu->gpgpu_ctx->func_sim->gpgpu_ptx_instruction_classification) {
m_gpu->gpgpu_ctx->func_sim->init_inst_classification_stat();
unsigned space_type = 0;
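      // Bucket the access by memory space for the classification histogram
      // (10 = global, 11 = local, ..., 16 = const; 0 = no memory space).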
switch (pI->get_space().get_type()) {
case global_space:
space_type = 10;
break;
case local_space:
space_type = 11;
break;
case tex_space:
space_type = 12;
break;
case surf_space:
space_type = 13;
break;
case param_space_kernel:
case param_space_local:
space_type = 14;
break;
case shared_space:
space_type = 15;
break;
case const_space:
space_type = 16;
break;
default:
space_type = 0;
break;
}
StatAddSample(m_gpu->gpgpu_ctx->func_sim->g_inst_classification_stat
[m_gpu->gpgpu_ctx->func_sim->g_ptx_kernel_count],
op_classification);
if (space_type)
StatAddSample(m_gpu->gpgpu_ctx->func_sim->g_inst_classification_stat
[m_gpu->gpgpu_ctx->func_sim->g_ptx_kernel_count],
(int)space_type);
StatAddSample(m_gpu->gpgpu_ctx->func_sim->g_inst_op_classification_stat
[m_gpu->gpgpu_ctx->func_sim->g_ptx_kernel_count],
(int)pI->get_opcode());
}
if ((m_gpu->gpgpu_ctx->func_sim->g_ptx_sim_num_insn % 100000) == 0) {
dim3 ctaid = get_ctaid();
dim3 tid = get_tid();
DPRINTF(LIVENESS,
"GPGPU-Sim PTX: %u instructions simulated : ctaid=(%u,%u,%u) "
"tid=(%u,%u,%u)\n",
m_gpu->gpgpu_ctx->func_sim->g_ptx_sim_num_insn, ctaid.x, ctaid.y,
ctaid.z, tid.x, tid.y, tid.z);
fflush(stdout);
}
// "Return values"
if (!skip) {
      if (!(inst_opcode == MMA_LD_OP || inst_opcode == MMA_ST_OP)) {
inst.space = insn_space;
inst.set_addr(lane_id, insn_memaddr);
inst.data_size = insn_data_size; // simpleAtomicIntrinsics
assert(inst.memory_op == insn_memory_op);
}
}
} catch (int x) {
printf("GPGPU-Sim PTX: ERROR (%d) executing intruction (%s:%u)\n", x,
pI->source_file(), pI->source_line());
printf("GPGPU-Sim PTX: '%s'\n", pI->get_source());
abort();
}
}
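Aside: the opcode switch above is generated with the X-macro pattern. The
file opcodes.def lists every opcode once, and each #include of it under a
different OP_DEF/OP_W_DEF definition expands that list into a different
construct (here, the case labels of the dispatch switch). Below is a
minimal, self-contained sketch of the same pattern; the OPCODES table,
opcode names, and handler functions are hypothetical stand-ins for
illustration, not GPGPU-Sim's actual opcodes.def entries.

#include <cstdio>

// Each X(OP, FUNC, CLASSIFICATION) entry plays the role of one line in
// opcodes.def (hypothetical opcodes and handlers).
#define OPCODES(X)     \
  X(ADD_OP, do_add, 1) \
  X(MUL_OP, do_mul, 1) \
  X(LD_OP, do_load, 2)

// First expansion: build the opcode enum from the table.
enum opcode_t {
#define X(OP, FUNC, CLASSIFICATION) OP,
  OPCODES(X)
#undef X
};

static void do_add() { std::printf("add handler\n"); }
static void do_mul() { std::printf("mul handler\n"); }
static void do_load() { std::printf("load handler\n"); }

// Second expansion: build the dispatch switch, exactly like the
// OP_DEF/OP_W_DEF expansion inside ptx_exec_inst.
static int dispatch(opcode_t op) {
  int classification = 0;
  switch (op) {
#define X(OP, FUNC, CLASSIFICATION)  \
  case OP:                           \
    FUNC();                          \
    classification = CLASSIFICATION; \
    break;
    OPCODES(X)
#undef X
    default:
      std::printf("invalid opcode (0x%x)\n", (unsigned)op);
      break;
  }
  return classification;
}

int main() { return dispatch(MUL_OP) == 1 ? 0 : 1; }  // prints "mul handler"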