20240804/ modify ptx_exec_inst

mail@pastecode.io avatar
unknown
c_cpp
5 months ago
14 kB
4
Indexable
void ptx_thread_info::ptx_exec_inst(warp_inst_t &inst, unsigned lane_id) {
  bool skip = false;
  int op_classification = 0;
  addr_t pc = next_instr();
  assert(pc ==
         inst.pc);  // make sure timing model and functional model are in sync
  const ptx_instruction *pI = m_func_info->get_instruction(pc);

  set_npc(pc + pI->inst_size());

  try {
    clearRPC();
    m_last_set_operand_value.u64 = 0;

    if (is_done()) {
      printf(
          "attempted to execute instruction on a thread that is already "
          "done.\n");
      assert(0);
    }

    if (g_debug_execution >= 6 ||
        m_gpu->get_config().get_ptx_inst_debug_to_file()) {
      if ((m_gpu->gpgpu_ctx->func_sim->g_debug_thread_uid == 0) ||
          (get_uid() ==
           (unsigned)(m_gpu->gpgpu_ctx->func_sim->g_debug_thread_uid))) {
        clear_modifiedregs();
        enable_debug_trace();
      }
    }

    if (pI->has_pred()) {
      const operand_info &pred = pI->get_pred();
      ptx_reg_t pred_value = get_operand_value(pred, pred, PRED_TYPE, this, 0);
      if (pI->get_pred_mod() == -1) {
        skip = (pred_value.pred & 0x0001) ^
               pI->get_pred_neg();  // ptxplus inverts the zero flag
      } else {
        skip = !pred_lookup(pI->get_pred_mod(), pred_value.pred & 0x000F);
      }
    }
    int inst_opcode = pI->get_opcode();

    if (skip) {
      inst.set_not_active(lane_id);
    } else {
      const ptx_instruction *pI_saved = pI;
      ptx_instruction *pJ = NULL;
      if (pI->get_opcode() == VOTE_OP || pI->get_opcode() == ACTIVEMASK_OP) {
        pJ = new ptx_instruction(*pI);
        *((warp_inst_t *)pJ) = inst;  // copy active mask information
        pI = pJ;
      }
      //! try to check mma.load address
      // Add this check for MMA load operations
      if (pI->get_opcode() == MMA_LD_OP) {
        printf("PTX Instruction: %s (wmma::load_matrix_sync) at %s:%u\n",
               pI->get_opcode_cstr(), pI->source_file(), pI->source_line());

        addr_t addr = last_eaddr();
        memory_space_t space = last_space();

        // Function to convert memory_space_t to string
        auto space_to_string = [](const memory_space_t &space) {
          switch (space.get_type()) {
            case undefined_space:
              return "undefined";
            case reg_space:
              return "register";
            case local_space:
              return "local";
            case shared_space:
              return "shared";
            case sstarr_space:
              return "sstarr";
            case param_space_unclassified:
              return "param (unclassified)";
            case param_space_kernel:
              return "param (kernel)";
            case param_space_local:
              return "param (local)";
            case const_space:
              return "constant";
            case tex_space:
              return "texture";
            case surf_space:
              return "surface";
            case global_space:
              return "global";
            case generic_space:
              return "generic";
            case instruction_space:
              return "instruction";
            default:
              return "unknown";
          }
        };

        printf("  MMA Load: Address 0x%llx\n", (unsigned long long)addr);
        printf("  Memory Space: %s\n", space_to_string(space));
        printf("  Memory Bank: %u\n", space.get_bank());

        // // Additional checks based on memory space
        // if (space.is_const()) {
        //   printf("  This is a constant memory access\n");
        // } else if (space.is_local()) {
        //   printf("  This is a local memory access\n");
        // } else if (space.is_global()) {
        //   printf("  This is a global memory access\n");
        // }

        // // Use last_eaddr() to get the memory address
        // addr_t addr = last_eaddr();

        // printf("  Loading from address 0x%llx\n", (unsigned long long)addr);

        // // If you want to print the size of the data being loaded
        // unsigned to_type = pI->get_type();
        // unsigned data_size = datatype2size(to_type);
        // printf("  Data size: %u bytes\n", data_size);
      }

      if (((inst_opcode == MMA_OP || inst_opcode == MMA_LD_OP ||
            inst_opcode == MMA_ST_OP))) {
        if (inst.active_count() != MAX_WARP_SIZE) {
          printf(
              "Tensor Core operation are warp synchronous operation. All the "
              "threads needs to be active.");
          assert(0);
        }
      }

      // Tensorcore is warp synchronous operation. So these instructions needs
      // to be executed only once. To make the simulation faster removing the
      // redundant tensorcore operation
      if (!tensorcore_op(inst_opcode) ||
          ((tensorcore_op(inst_opcode)) && (lane_id == 0))) {
        switch (inst_opcode) {
#define OP_DEF(OP, FUNC, STR, DST, CLASSIFICATION) \
  case OP:                                         \
    FUNC(pI, this);                                \
    op_classification = CLASSIFICATION;            \
    break;
#define OP_W_DEF(OP, FUNC, STR, DST, CLASSIFICATION) \
  case OP:                                           \
    FUNC(pI, get_core(), inst);                      \
    op_classification = CLASSIFICATION;              \
    break;
#include "opcodes.def"
#undef OP_DEF
#undef OP_W_DEF
          default:
            printf("Execution error: Invalid opcode (0x%x)\n",
                   pI->get_opcode());
            break;
        }
      }
      delete pJ;
      pI = pI_saved;

      // Run exit instruction if exit option included
      if (pI->is_exit()) exit_impl(pI, this);
    }

    const gpgpu_functional_sim_config &config = m_gpu->get_config();

    // Output instruction information to file and stdout
    if (config.get_ptx_inst_debug_to_file() != 0 &&
        (config.get_ptx_inst_debug_thread_uid() == 0 ||
         config.get_ptx_inst_debug_thread_uid() == get_uid())) {
      fprintf(m_gpu->get_ptx_inst_debug_file(), "[thd=%u] : (%s:%u - %s)\n",
              get_uid(), pI->source_file(), pI->source_line(),
              pI->get_source());
      // fprintf(ptx_inst_debug_file, "has memory read=%d, has memory
      // write=%d\n", pI->has_memory_read(), pI->has_memory_write());
      fflush(m_gpu->get_ptx_inst_debug_file());
    }

    if (m_gpu->gpgpu_ctx->func_sim->ptx_debug_exec_dump_cond<5>(get_uid(),
                                                                pc)) {
      dim3 ctaid = get_ctaid();
      dim3 tid = get_tid();
      printf(
          "%u [thd=%u][i=%u] : ctaid=(%u,%u,%u) tid=(%u,%u,%u) icount=%u "
          "[pc=%u] (%s:%u - %s)  [0x%llx]\n",
          m_gpu->gpgpu_ctx->func_sim->g_ptx_sim_num_insn, get_uid(), pI->uid(),
          ctaid.x, ctaid.y, ctaid.z, tid.x, tid.y, tid.z, get_icount(), pc,
          pI->source_file(), pI->source_line(), pI->get_source(),
          m_last_set_operand_value.u64);
      fflush(stdout);
    }

    addr_t insn_memaddr = 0xFEEBDAED;
    memory_space_t insn_space = undefined_space;
    _memory_op_t insn_memory_op = no_memory_op;
    unsigned insn_data_size = 0;
    if ((pI->has_memory_read() || pI->has_memory_write())) {
      if (!((inst_opcode == MMA_LD_OP || inst_opcode == MMA_ST_OP))) {
        insn_memaddr = last_eaddr();
        insn_space = last_space();
        unsigned to_type = pI->get_type();
        insn_data_size = datatype2size(to_type);
        insn_memory_op = pI->has_memory_read() ? memory_load : memory_store;
        // Function to convert memory_space_t to string
        auto space_to_string = [](const memory_space_t &space) {
          switch (space.get_type()) {
            case undefined_space:
              return "undefined";
            case reg_space:
              return "register";
            case local_space:
              return "local";
            case shared_space:
              return "shared";
            case sstarr_space:
              return "sstarr";
            case param_space_unclassified:
              return "param (unclassified)";
            case param_space_kernel:
              return "param (kernel)";
            case param_space_local:
              return "param (local)";
            case const_space:
              return "constant";
            case tex_space:
              return "texture";
            case surf_space:
              return "surface";
            case global_space:
              return "global";
            case generic_space:
              return "generic";
            case instruction_space:
              return "instruction";
            default:
              return "unknown";
          }
        };

        //* Add this print statement
        if (pI->has_memory_read()) {
          printf(
              "PTX Instruction: %s (at %s:%u) is reading memory. Address: "
              "0x%llx, Size: %u bytes, Space: %s\n",
              pI->get_opcode_cstr(), pI->source_file(), pI->source_line(),
              (unsigned long long)insn_memaddr, insn_data_size,
              space_to_string(insn_space));
        }
      }
    }

    if (pI->get_opcode() == BAR_OP && pI->barrier_op() == RED_OPTION) {
      inst.add_callback(lane_id, last_callback().function,
                        last_callback().instruction, this,
                        false /*not atomic*/);
    }

    if (pI->get_opcode() == ATOM_OP) {
      insn_memaddr = last_eaddr();
      insn_space = last_space();
      inst.add_callback(lane_id, last_callback().function,
                        last_callback().instruction, this, true /*atomic*/);
      unsigned to_type = pI->get_type();
      insn_data_size = datatype2size(to_type);
    }

    if (pI->get_opcode() == TEX_OP) {
      inst.set_addr(lane_id, last_eaddr());
      assert(inst.space == last_space());
      insn_data_size = get_tex_datasize(
          pI,
          this);  // texture obtain its data granularity from the texture info
    }

    // Output register information to file and stdout
    if (config.get_ptx_inst_debug_to_file() != 0 &&
        (config.get_ptx_inst_debug_thread_uid() == 0 ||
         config.get_ptx_inst_debug_thread_uid() == get_uid())) {
      dump_modifiedregs(m_gpu->get_ptx_inst_debug_file());
      dump_regs(m_gpu->get_ptx_inst_debug_file());
    }

    if (g_debug_execution >= 6) {
      if (m_gpu->gpgpu_ctx->func_sim->ptx_debug_exec_dump_cond<6>(get_uid(),
                                                                  pc))
        dump_modifiedregs(stdout);
    }
    if (g_debug_execution >= 10) {
      if (m_gpu->gpgpu_ctx->func_sim->ptx_debug_exec_dump_cond<10>(get_uid(),
                                                                   pc))
        dump_regs(stdout);
    }
    update_pc();
    m_gpu->gpgpu_ctx->func_sim->g_ptx_sim_num_insn++;

    // not using it with functional simulation mode
    if (!(this->m_functionalSimulationMode))
      ptx_file_line_stats_add_exec_count(pI);

    if (m_gpu->gpgpu_ctx->func_sim->gpgpu_ptx_instruction_classification) {
      m_gpu->gpgpu_ctx->func_sim->init_inst_classification_stat();
      unsigned space_type = 0;
      switch (pI->get_space().get_type()) {
        case global_space:
          space_type = 10;
          break;
        case local_space:
          space_type = 11;
          break;
        case tex_space:
          space_type = 12;
          break;
        case surf_space:
          space_type = 13;
          break;
        case param_space_kernel:
        case param_space_local:
          space_type = 14;
          break;
        case shared_space:
          space_type = 15;
          break;
        case const_space:
          space_type = 16;
          break;
        default:
          space_type = 0;
          break;
      }
      StatAddSample(m_gpu->gpgpu_ctx->func_sim->g_inst_classification_stat
                        [m_gpu->gpgpu_ctx->func_sim->g_ptx_kernel_count],
                    op_classification);
      if (space_type)
        StatAddSample(m_gpu->gpgpu_ctx->func_sim->g_inst_classification_stat
                          [m_gpu->gpgpu_ctx->func_sim->g_ptx_kernel_count],
                      (int)space_type);
      StatAddSample(m_gpu->gpgpu_ctx->func_sim->g_inst_op_classification_stat
                        [m_gpu->gpgpu_ctx->func_sim->g_ptx_kernel_count],
                    (int)pI->get_opcode());
    }
    if ((m_gpu->gpgpu_ctx->func_sim->g_ptx_sim_num_insn % 100000) == 0) {
      dim3 ctaid = get_ctaid();
      dim3 tid = get_tid();
      DPRINTF(LIVENESS,
              "GPGPU-Sim PTX: %u instructions simulated : ctaid=(%u,%u,%u) "
              "tid=(%u,%u,%u)\n",
              m_gpu->gpgpu_ctx->func_sim->g_ptx_sim_num_insn, ctaid.x, ctaid.y,
              ctaid.z, tid.x, tid.y, tid.z);
      fflush(stdout);
    }

    // "Return values"
    if (!skip) {
      if (!((inst_opcode == MMA_LD_OP || inst_opcode == MMA_ST_OP))) {
        inst.space = insn_space;
        inst.set_addr(lane_id, insn_memaddr);
        inst.data_size = insn_data_size;  // simpleAtomicIntrinsics
        assert(inst.memory_op == insn_memory_op);
      }
    }

  } catch (int x) {
    printf("GPGPU-Sim PTX: ERROR (%d) executing intruction (%s:%u)\n", x,
           pI->source_file(), pI->source_line());
    printf("GPGPU-Sim PTX:       '%s'\n", pI->get_source());
    abort();
  }
}
Leave a Comment