# Untitled

import onnx
import numpy as np
import torch
import cutlass
from cutlass.epilogue import relu
from cutlass import Tensor as FakeTensor
from cutlass.utils.profiler import CUDAEventProfiler
from transformers import BertTokenizer
import time
# Module-level tokenizer, loaded once at import time and used by main().
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Shared CUTLASS GEMM plan used for every fused MatMul(+Add) in
# ONNXCompiler._execute_node: float16 operands, float32 accumulation,
# row-major layout. _execute_node reassigns plan.epilogue_visitor before each
# plan.run call, so the plan carries mutable state between nodes.
# NOTE(review): cc=80 presumably selects compute capability 8.0 - confirm it
# matches the GPU this script runs on.
plan = cutlass.op.Gemm(
    element=torch.float16,
    layout=cutlass.LayoutType.RowMajor,
    element_accumulator=torch.float32,
    cc=80
)

# Epilogue for the single-Add fusion: D = accum + C.
# It is passed to cutlass.epilogue.trace in ONNXCompiler._execute_node together
# with an example-tensor dict keyed "accum", "C" and "D", so the parameter
# names and the returned variable name must keep matching those keys.
# NOTE(review): cutlass.epilogue.trace presumably inspects this function's
# source, so keep the body to plain assignments (no docstring) - confirm
# before restructuring.
def example_epilogue(accum, C):
    D = accum + C
    return D

# Epilogue for the two-Add fusion: D = accum + C + F.
# It is passed to cutlass.epilogue.trace in ONNXCompiler._execute_node together
# with an example-tensor dict keyed "accum", "C", "F" and "D", so the parameter
# names and the returned variable name must keep matching those keys.
# NOTE(review): cutlass.epilogue.trace presumably inspects this function's
# source, so keep the body to plain assignments (no docstring) - confirm
# before restructuring.
def example_epilogue2(accum, C, F):
    D = accum + C + F
    return D

class ONNXCompiler:
    def __init__(self, model_path):
        """Load the ONNX model at ``model_path`` and set up the tensor store.

        The constructor prints the name and static shape of every graph input,
        converts every graph initializer to a NumPy array, and seeds
        ``self.tensors`` (the name -> value store used during execution) with
        a shallow copy of those initializers.

        Args:
            model_path: Path handed to ``onnx.load``.
        """
        model = onnx.load(model_path)
        self.model = model
        self.graph = model.graph
        self.nodes = self.graph.node
        # Names of Add nodes whose result was already produced by a fused MatMul.
        self.processed_nodes = set()
        # Names of MatMul nodes that were executed through the fused kernel.
        self.matmul_add_fuse_node = set()

        for graph_input in self.graph.input:
            dims = [dim.dim_value for dim in graph_input.type.tensor_type.shape.dim]
            print(f"Model Input Name: {graph_input.name}, Shape: {dims}")

        # Convert every TensorProto initializer into a NumPy array, keyed by name.
        self.initializers = {
            init.name: onnx.numpy_helper.to_array(init)
            for init in self.graph.initializer
        }

        self.tensors = self.initializers.copy()

    def _execute_node(self, node):
        op_type = node.op_type
        inputs = []

        # 收集输入张量
        for input_name in node.input:
            if input_name in self.tensors:
                inputs.append(self.tensors[input_name])
            else:
                print(f"Warning: Missing input tensor '{input_name}' for node '{node.name}'.")
                return None
            
        if op_type == "MatMul":

            # 提取 MatMul 輸入
            A = inputs[0]
            B = inputs[1]
            original_shape_A = A.shape
            original_shape_B = B.shape
            # print("shape: ")
            # print(original_shape_A, original_shape_B)
            # 處理多維情況（批量矩陣乘法）
            if A.ndim > 2 or B.ndim > 2:
                batch_dims = np.prod(original_shape_A[:-2])
                A = A.reshape(batch_dims, original_shape_A[-2], original_shape_A[-1])
                B = B.reshape(batch_dims, original_shape_B[-2], original_shape_B[-1])

            M, K = A.shape[-2], A.shape[-1]
            K, N = B.shape[-2], B.shape[-1]

            tensor_A = torch.tensor(A, dtype=torch.float16, device="cuda").contiguous()
            tensor_B = torch.tensor(B, dtype=torch.float16, device="cuda").contiguous()

            # 查找與 MatMul 輸出相關的 Add 節點
            matmul_output_name = node.output[0]
            related_add_nodes = [
                n for n in self.nodes if n.op_type == "Add" and matmul_output_name in n.input
            ]

            # related_add_nodes = None

            if not related_add_nodes:
                # 情況 3：MatMul 後無 Add
                print(f"No Add node related to MatMul output: {node.name}. Executing regular MatMul.")
                result = torch.matmul(tensor_A, tensor_B)
                result = result.cpu().numpy()
                if len(original_shape_A) > 2:
                    result = result.reshape(*original_shape_A[:-2], M, N)
                return result

            # 情況 1 或 2：MatMul 輸出為 Add，檢查 Add 的輸出是否也為 Add
            self.matmul_add_fuse_node.add(node.name)

            add_node = related_add_nodes[0]  # 第一層 Add
            add_input_name = [name for name in add_node.input if name != matmul_output_name][0]
            add_input = self.tensors.get(add_input_name)

            if add_input is not None:
                tensor_add_input = torch.tensor(add_input, dtype=torch.float16, device="cuda").contiguous()
                if tensor_add_input.ndim == 1:
                    if tensor_add_input.shape[0] == M:
                        tensor_add_input = tensor_add_input.unsqueeze(1).expand(M, N)
                    elif tensor_add_input.shape[0] == N:
                        tensor_add_input = tensor_add_input.unsqueeze(0).expand(M, N)

                # 檢查第一層 Add 的輸出是否也作為另一個 Add 的輸入
                add_output_name = add_node.output[0]
                next_add_nodes = [
                    n for n in self.nodes if n.op_type == "Add" and add_output_name in n.input
                ]

                if next_add_nodes:
                    # 情況 2：兩層 Add 融合
                    next_add_node = next_add_nodes[0]
                    next_add_input_name = [
                        name for name in next_add_node.input if name != add_output_name
                    ][0]
                    next_add_input = self.tensors.get(next_add_input_name)

                    if next_add_input is not None:
                        print(f"Fusing MatMul with 2Add for Node: {node.name}")
                        tensor_next_add_input = torch.tensor(next_add_input, dtype=torch.float16, device="cuda").contiguous()
                    
                        # result = torch.matmul(tensor_A, tensor_B) + tensor_add_input + tensor_next_add_input
                        # result = result.cpu().numpy()

                        tensor_D = torch.zeros_like(tensor_add_input)

                        examples_tensors = {
                            "accum": FakeTensor(element=torch.float32, shape=(M, N), layout_tag=cutlass.LayoutType.RowMajor),
                            "C": tensor_add_input,
                            "F": tensor_next_add_input,
                            "D": tensor_D
                        }

                        epilogue_visitor = cutlass.epilogue.trace(example_epilogue2, examples_tensors)
                        epilogue_visitor.epilogue_stages = 1
                        
                        visitor_args = {
                            "C": tensor_add_input, "F": tensor_next_add_input, "D": tensor_D
                        }

                        plan.epilogue_visitor = epilogue_visitor
                        plan.run(
                            tensor_A, tensor_B, tensor_add_input, tensor_D,
                            visitor_args=visitor_args, print_module=False
                        )

                        result = tensor_D.cpu().numpy()

                        if len(original_shape_A) > 2:
                            result = result.reshape(*original_shape_A[:-2], M, N)
                        self.processed_nodes.add(next_add_node.name)
                        self.processed_nodes.add(add_node.name)
                        self.tensors[add_node.output[0]] = result
                        self.tensors[next_add_node.output[0]] = result
                        return result

                # 情況 1：一層 Add 融合
                
                print(f"Fusing MatMul with Add for Node: {node.name}")
                # result = torch.matmul(tensor_A, tensor_B) + tensor_add_input
                # result = result.cpu().numpy()

                # tensor_add_input = torch.ceil(torch.empty(size=(M, N), dtype=torch.float16, device="cuda").uniform_(-4, 4))

                tensor_D = torch.zeros_like(tensor_add_input)

                examples_tensors = {
                    "accum": FakeTensor(element=torch.float32, shape=(M, N), layout_tag=cutlass.LayoutType.RowMajor),
                    "C": tensor_add_input,
                    "D": tensor_D
                }

                epilogue_visitor = cutlass.epilogue.trace(example_epilogue, examples_tensors)
                epilogue_visitor.epilogue_stages = 1
                
                visitor_args = {
                    "C": tensor_add_input, "D": tensor_D
                }

                plan.epilogue_visitor = epilogue_visitor
                plan.run(
                    tensor_A, tensor_B, tensor_add_input, tensor_D,
                    visitor_args=visitor_args, print_module=False
                )

                result = tensor_D.cpu().numpy()
                
                if len(original_shape_A) > 2:
                    result = result.reshape(*original_shape_A[:-2], M, N)
                self.processed_nodes.add(add_node.name)
                self.tensors[add_node.output[0]] = result
                return result
                
        # 運算邏輯
        elif op_type == "Slice":
            data = inputs[0]
            starts = inputs[1] if len(inputs) > 1 else np.zeros(data.ndim, dtype=np.int64)
            ends = inputs[2] if len(inputs) > 2 else np.array(data.shape, dtype=np.int64)
            axes = inputs[3] if len(inputs) > 3 else np.arange(data.ndim, dtype=np.int64)
            steps = inputs[4] if len(inputs) > 4 else np.ones_like(starts, dtype=np.int64)

            axes = np.array(axes, dtype=np.int64)
            slices = [slice(None)] * data.ndim

            for start, end, axis, step in zip(starts, ends, axes, steps):
                axis = int(axis)
                dim = data.shape[axis]
                start = int(start + dim if start < 0 else start)
                end = int(end + dim if end < 0 else end)
                start = np.clip(start, 0, dim)
                end = np.clip(end, 0, dim + 1 if step > 0 else dim)

                slices[axis] = slice(start, end, int(step))

            return data[tuple(slices)]

        elif op_type == "Add":
            # 如果 Add 節點已處理，直接返回結果
            if node.name in self.processed_nodes:
                print(f"Skipping already processed Add Node: {node.name}")
                return inputs[0]
            self.processed_nodes.add(node.name)
            A = torch.tensor(inputs[0], device="cuda")
            B = torch.tensor(inputs[1], device="cuda")
            result = (A + B).cpu().numpy()
            self.tensors[node.output[0]] = result
            return result

        elif op_type == "Sub":
            A = torch.tensor(inputs[0], device="cuda")
            B = torch.tensor(inputs[1], device="cuda")
            result = (A - B).cpu().numpy()
            self.tensors[node.output[0]] = result
            return result
        elif op_type == "Mul":
            # 確保形狀兼容，否則調整形狀
            try:
                A = torch.tensor(inputs[0], device="cuda")
                B = torch.tensor(inputs[1], device="cuda")
                result = (A * B).cpu().numpy()
                self.tensors[node.output[0]] = result
            except ValueError:
                # 嘗試廣播形狀
                print("Broadcasting shapes for Mul operation...")
                inputs[1] = np.broadcast_to(inputs[1], inputs[0].shape)
                A = torch.tensor(inputs[0], device="cuda")
                B = torch.tensor(inputs[1], device="cuda")
                result = (A * B).cpu().numpy()
                self.tensors[node.output[0]] = result
            return result

        elif op_type == "Div":
            A = torch.tensor(inputs[0], device="cuda")
            B = torch.tensor(inputs[1], device="cuda")
            result = (A / B).cpu().numpy()
            self.tensors[node.output[0]] = result
            return result
        elif op_type == "Sqrt":
            A = torch.tensor(inputs[0], device="cuda")
            result = torch.sqrt(A).cpu().numpy()
            self.tensors[node.output[0]] = result
            return result
        elif op_type == "Reciprocal":
            A = torch.tensor(inputs[0], device="cuda")
            result = (1 / A).cpu().numpy()
            self.tensors[node.output[0]] = result
            return result
        elif op_type == "Shape":
            result = np.array(inputs[0].shape, dtype=np.int64)
            self.tensors[node.output[0]] = result
            return result
        elif op_type == "Transpose":
            A = torch.tensor(inputs[0], device="cuda")
            perm = [attr.ints for attr in node.attribute if attr.name == "perm"]
            if not perm:
                perm = list(range(A.ndim))[::-1]
            else:
                perm = perm[0]
            result = A.permute(*perm).cpu().numpy()
            self.tensors[node.output[0]] = result
            return result

        elif op_type == "Reshape":
            if len(inputs) > 1:
                shape = inputs[1].astype(np.int64)
            else:
                shape = np.array(node.attribute[0].ints, dtype=np.int64)
            return np.reshape(inputs[0], shape)

        elif op_type == "Concat":
            axis = self.tensors[node.input[1]] if len(node.input) > 1 and node.input[1] in self.tensors else 0
            if isinstance(axis, np.ndarray):
                axis = axis.item()
            axis = int(axis)
            return np.concatenate(inputs, axis=axis)
        elif op_type == "Squeeze":
            axes = self.tensors[node.input[1]] if len(node.input) > 1 else None
            if axes is not None:
                axes = np.array(axes, dtype=int)
                valid_axes = [axis for axis in axes if inputs[0].shape[axis] == 1]
                if not valid_axes:
                    raise ValueError("Cannot squeeze axes that do not have size equal to one.")
                return np.squeeze(inputs[0], axis=tuple(valid_axes))
            else:
                return np.squeeze(inputs[0])
        elif op_type == "Unsqueeze":
            axes = self.tensors[node.input[1]] if len(node.input) > 1 else []
            if isinstance(axes, np.ndarray):
                axes = axes.tolist()
            return np.expand_dims(inputs[0], axis=tuple(axes))
        elif op_type == "Identity":
            return inputs[0]
        elif op_type == "Tanh":
            A = torch.tensor(inputs[0], device="cuda")
            result = torch.tanh(A).cpu().numpy()
            self.tensors[node.output[0]] = result
            return result
        elif op_type == "Sigmoid":
            A = torch.tensor(inputs[0], device="cuda")
            result = torch.sigmoid(A).cpu().numpy()
            self.tensors[node.output[0]] = result
            return result
        elif op_type == "Relu":
            A = torch.tensor(inputs[0], device="cuda")
            result = torch.relu(A).cpu().numpy()
            self.tensors[node.output[0]] = result
            return result
        elif op_type == "Pow":
            A = torch.tensor(inputs[0], device="cuda")
            B = torch.tensor(inputs[1], device="cuda")
            result = torch.pow(A, B).cpu().numpy()
            self.tensors[node.output[0]] = result
            return result
        elif op_type == "Gather":
            data = inputs[0]
            indices = inputs[1]
            axis = self.tensors[node.input[2]] if len(node.input) > 2 and node.input[2] in self.tensors else 0
            return np.take(data, indices, axis=axis)
        elif op_type == "ReduceMean":
            A = torch.tensor(inputs[0], device="cuda")
            axes = inputs[1] if len(inputs) > 1 else None
            keepdims = inputs[2] if len(inputs) > 2 else True
            result = torch.mean(A, dim=axes, keepdim=keepdims).cpu().numpy()
            self.tensors[node.output[0]] = result
            return result
        
        elif op_type == "Cast":
            dtype_map = {
                1: np.float32,   # FLOAT
                2: np.uint8,     # UINT8
                3: np.int8,      # INT8
                4: np.uint16,    # UINT16
                5: np.int16,     # INT16
                6: np.int32,     # INT32
                7: np.int64,     # INT64
                8: str,          # STRING
                9: np.bool_,     # BOOL
                10: np.float16,  # FLOAT16
                11: np.double,   # DOUBLE
                12: np.uint32,   # UINT32
                13: np.uint64,   # UINT64
            }
            target_type = node.attribute[0].i if node.attribute else None
            if target_type not in dtype_map:
                raise NotImplementedError(f"Unsupported target type {target_type} for Cast operation.")
            return inputs[0].astype(dtype_map[target_type])
        elif op_type == "ConstantOfShape":
            shape = inputs[0].astype(np.int64)  # 這裡輸入是形狀數據
            value = node.attribute[0].t if node.attribute else 0  # 獲取常數值，預設為 0
            # 直接將常數值轉換為 NumPy 格式
            constant_value = np.frombuffer(value.raw_data, dtype=np.float32) if value else np.array(0, dtype=np.float32)
            return np.full(shape, constant_value, dtype=constant_value.dtype)
        elif op_type == "OneHot":
            # 提取輸入數據
            indices = inputs[0]  # 索引數據
            depth = int(inputs[1])  # one-hot 深度
            values = inputs[2] if len(inputs) > 2 else np.array([0, 1], dtype=np.float32)  # one-hot 值
            axis = next((attr.i for attr in node.attribute if attr.name == "axis"), -1)  # 默認 -1（最後一個軸）

            # 建立 one-hot 編碼
            eye_matrix = np.eye(depth, dtype=values.dtype)  # 深度對應的單位矩陣
            one_hot = eye_matrix[indices.reshape(-1)]  # 根據索引生成 one-hot 張量

            # 將 one-hot 編碼調整為指定的軸位置
            if axis == -1:
                result = one_hot
            else:
                result = np.moveaxis(one_hot, -1, axis)  # 移動 one-hot 軸到指定位置

            # 使用 values[0] 和 values[1] 替換默認的 0 和 1
            result = result * (values[1] - values[0]) + values[0]
            return result
        elif op_type == "Softmax":
            A = torch.tensor(inputs[0], device="cuda")
            axis = node.attribute[0].i if node.attribute else -1
            result = torch.softmax(A, dim=axis).cpu().numpy()
            self.tensors[node.output[0]] = result
            return result
        
        elif op_type == "Split":
            
            # 提取輸入數據
            input_data = inputs[0]
            
            # 從屬性中獲取 axis 和 split
            axis = next((attr.i for attr in node.attribute if attr.name == "axis"), 0)
            split = next((attr.ints for attr in node.attribute if attr.name == "split"), None)
            
            # 如果未指定 split，均勻分割
            if split is None:
                num_splits = len(node.output)
                if input_data.shape[axis] % num_splits != 0:
                    raise ValueError(f"Cannot evenly split axis {axis} into {num_splits} parts.")
                split_size = input_data.shape[axis] // num_splits
                split = [split_size] * num_splits
            
            # print(f"Input Shape: {input_data.shape}, Axis: {axis}, Split Sizes: {split}")
            
            # 執行分割
            result = np.split(input_data, np.cumsum(split[:-1]), axis=axis)
            
            # 確保輸出對應於節點的輸出名稱
            for i, output_name in enumerate(node.output):
                self.tensors[output_name] = result[i]
                # print(f"Output {i} Shape: {result[i].shape}")
            return

        else:
            raise NotImplementedError(f"Operation {op_type} not implemented")

    def execute(self, inputs):
        # 確保 inputs 被添加到張量字典中
        for input_name, input_value in inputs.items():
            self.tensors[input_name] = input_value
        # 打印所有輸入的詳細資訊
        print("\nInputs Details:")
        for input_name, input_value in inputs.items():
            print(f"Input Name: {input_name}")
            print(f"Shape: {input_value.shape if isinstance(input_value, np.ndarray) else 'N/A'}")
            if isinstance(input_value, np.ndarray):
                print(f"Data (first 10 values): {input_value.flatten()[:10]}...")
            else:
                print(f"Data: {input_value}")
            print("-" * 50)

        execution_order = []
        node_execution_times = {}  # 用於記錄每個節點的執行時間
        total_execution_time = 0.0  # 累計所有節點的執行時間
        total_fuse_execution_time = 0.0
        total_matmul_add_time = 0.0

        for node in self.nodes:
            # print(f"Executing node: {node.name}")
            # 檢查節點的所有輸入是否已準備好
            ready = all(inp in self.tensors for inp in node.input)
            if ready:
                if node.name in self.processed_nodes:
                    # print(f"Skipping already processed Add Node: {node.name}")
                    output_name = node.output[0]
                    # 假設 _execute_node 已實現
                    self.tensors[output_name] = self._execute_node(node)
                else:
                    node_start_time = time.time()  # 節點開始執行時間
                    output_name = node.output[0]
                    # 假設 _execute_node 已實現
                    self.tensors[output_name] = self._execute_node(node)
                    node_end_time = time.time()  # 節點結束執行時間

                    # 記錄執行時間
                    execution_time = node_end_time - node_start_time
                    node_execution_times[node.name] = execution_time
                    total_execution_time += execution_time  # 累計執行時間
                    # if node.name in self.matmul_add_fuse_node:
                    #     print(f"MatMul Fuse Node: {node.name}, Execution Time: {execution_time:.6f} seconds\n")
                    # elif node.op_type == "MatMul":
                    #     print(f"MatMul Node: {node.name}, Execution Time: {execution_time:.6f} seconds\n")
                    # elif node.op_type == "Add":
                    #     print(f"Add Node: {node.name}, Execution Time: {execution_time:.6f} seconds\n")
                    if node.op_type == "MatMul" or node.op_type == "Add":
                        total_matmul_add_time += execution_time

                execution_order.append(node)
            else:
                print(f"Skipping node '{node.name}' due to missing inputs.")

        # 打印所有節點的執行時間
        print("\nNode Execution Times:")
        for node_name, exec_time in node_execution_times.items():
            if node_name in self.matmul_add_fuse_node:
                print(f"Matmul Fuse Node: {node_name}, Execution Time: {exec_time:.6f} seconds")
                total_fuse_execution_time += exec_time
            else:
                print(f"Node: {node_name}, Execution Time: {exec_time:.6f} seconds")

        # 打印所有節點的總執行時間
        print(f"\nTotal Execution Time: {total_execution_time:.6f} seconds")
        print(f"\nTotal Matmul Fuse Execution Time: {total_fuse_execution_time:.6f} seconds")
        print(f"\nTotal Matmul + Add Execution Time: {total_matmul_add_time:.6f} seconds")

        # 收集所有輸出的張量
        outputs = {o.name: self.tensors[o.name] for o in self.graph.output if o.name in self.tensors}
        return outputs, execution_order


def main():
    """Run the BERT-SQuAD ONNX model once on random inputs and report timing.

    The question/context pair is tokenized, but the model is currently fed
    random ids of the same shape; switch to the tokenizer fields noted below
    to run on real text. A ``ValueError`` raised during execution is reported
    rather than propagated.
    """
    compiler = ONNXCompiler("model/bertsquad-10_simplified.onnx")

    # Example question and context.
    question = "What is the capital of France?"
    context = "The capital of France is Paris."

    batch_size = 1
    seq_len = 256

    # Tokenize the example. To run on real text, feed inputs['input_ids'],
    # inputs['token_type_ids'] and inputs['attention_mask'] (cast to int64)
    # instead of the random arrays below.
    inputs = tokenizer(question, context, return_tensors='np', padding='max_length', max_length=seq_len, truncation=True)

    input_ids = np.random.randint(0, 30522, size=(batch_size, seq_len), dtype=np.int64)
    segment_ids = np.random.randint(0, 2, size=(batch_size, seq_len), dtype=np.int64)
    input_mask = np.random.randint(0, 2, size=(batch_size, seq_len), dtype=np.int64)
    # One id per batch row. ``size=(0)`` is the integer 0 and produced an
    # empty array instead.
    unique_ids_raw_output = np.random.randint(0, 2, size=(batch_size,), dtype=np.int64)

    try:
        print("Starting model execution...")
        start_time = time.perf_counter()
        outputs, execution_order = compiler.execute({
            "input_ids:0": input_ids,
            "segment_ids:0": segment_ids,
            "input_mask:0": input_mask,
            "unique_ids_raw_output___9:0": unique_ids_raw_output
        })
        end_time = time.perf_counter()

        print("Execution complete.")
        print(f"\nTotal execution time: {end_time - start_time:.6f} seconds")
    except ValueError as e:
        print(f"Error: {e}")

# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()
# Editor is loading...