(mlc-prebuilt) cfruan@catalyst-fleet:/ssd1/cfruan/mlc-llm$ python3 build.py --model=WizardMath-7B-V1.0 --quantization=q4f32_1 --target=webgpu
Using path "dist/models/WizardMath-7B-V1.0" for model "WizardMath-7B-V1.0"
Target configured: webgpu -keys=webgpu,gpu -max_num_threads=256
Automatically using target for weight quantization: cuda -keys=cuda,gpu -arch=sm_89 -max_num_threads=1024 -max_shared_memory_per_block=49152 -max_threads_per_block=1024 -registers_per_block=65536 -thread_warp_size=32
Start computing and quantizing weights... This may take a while.
Finish computing and quantizing weights.
Total param size: 3.9250688552856445 GB
Start storing to cache dist/WizardMath-7B-V1.0-q4f32_1/params
[0327/0327] saving param_326
All finished, 132 total shards committed, record saved to dist/WizardMath-7B-V1.0-q4f32_1/params/ndarray-cache.json
Finish exporting chat config to dist/WizardMath-7B-V1.0-q4f32_1/params/mlc-chat-config.json
[04:00:02] /workspace/tvm/src/relax/ir/block_builder.cc:64: Warning: BlockBuilder destroyed with remaining blocks!
Traceback (most recent call last):
  File "/ssd1/cfruan/mlc-llm/build.py", line 4, in <module>
    main()
  File "/ssd1/cfruan/mlc-llm/mlc_llm/build.py", line 10, in main
    core.build_model_from_args(parsed_args)
  File "/ssd1/cfruan/mlc-llm/mlc_llm/core.py", line 572, in build_model_from_args
    mod = mod_transform_before_build(mod, param_manager, args, config)
  File "/ssd1/cfruan/mlc-llm/mlc_llm/core.py", line 361, in mod_transform_before_build
    mod = fuse_split_rotary_embedding(mod, config["num_attention_heads"], config["hidden_size"])
  File "/ssd1/cfruan/mlc-llm/mlc_llm/transform/fuse_split_rotary_embedding.py", line 177, in fuse_split_rotary_embedding
    mod["decode"] = rewrite_bindings(ctx, rewriter, mod["decode"])
  File "/home/cfruan/.conda/envs/mlc-prebuilt/lib/python3.10/site-packages/tvm/relax/dpl/rewrite.py", line 118, in rewrite_bindings
    return ffi.rewrite_bindings(ctx, rewriter, func)
  File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
  File "tvm/_ffi/_cython/./packed_func.pxi", line 262, in tvm._ffi._cy3.core.FuncCall
  File "tvm/_ffi/_cython/./packed_func.pxi", line 251, in tvm._ffi._cy3.core.FuncCall3
  File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
tvm._ffi.base.TVMError: Traceback (most recent call last):
30: TVMFuncCall
29: _ZN3tvm7runtime13PackedFun
28: tvm::runtime::TypedPackedFunc<tvm::relax::Function (tvm::relax::PatternContext const&, tvm::runtime::PackedFunc, tvm::relax::Function)>::AssignTypedLambda<tvm::relax::Function (*)(tvm::relax::PatternContext const&, tvm::runtime::PackedFunc, tvm::relax::Function)>(tvm::relax::Function (*)(tvm::relax::PatternContext const&, tvm::runtime::PackedFunc, tvm::relax::Function), std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >)::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}::operator()(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*) const
27: tvm::relax::RewriteBindings(tvm::relax::PatternContext const&, tvm::runtime::PackedFunc, tvm::relax::Function)
26: tvm::relax::Function tvm::relax::PatternRewriter::Run<tvm::relax::PatternContext>(tvm::relax::PatternContext, tvm::runtime::PackedFunc, tvm::relax::Function)
25: tvm::relax::ExprMutator::VisitExpr(tvm::RelayExpr const&)
24: tvm::relax::ExprFunctor<tvm::RelayExpr (tvm::RelayExpr const&)>::VisitExpr(tvm::RelayExpr const&)
23: _ZZN3tvm5relax11ExprFuncto
22: tvm::relax::ExprMutator::VisitExpr_(tvm::relax::FunctionNode const*)
21: tvm::relax::ExprMutator::VisitWithNewScope(tvm::RelayExpr const&, tvm::runtime::Optional<tvm::runtime::Array<tvm::relax::Var, void> >)
20: tvm::relax::ExprMutator::VisitExpr(tvm::RelayExpr const&)
19: tvm::relax::ExprFunctor<tvm::RelayExpr (tvm::RelayExpr const&)>::VisitExpr(tvm::RelayExpr const&)
18: _ZZN3tvm5relax11ExprFuncto
17: tvm::relax::ExprMutator::VisitExpr_(tvm::relax::SeqExprNode const*)
16: tvm::relax::ExprMutator::VisitBindingBlock(tvm::relax::BindingBlock const&)
15: tvm::relax::PatternRewriter::VisitBindingBlock_(tvm::relax::DataflowBlockNode const*)
14: tvm::relax::PatternRewriter::RewriteDataflowBlockFixedPoint(tvm::relax::BindingBlock)
13: tvm::relax::PatternRewriter::VisitBinding_(tvm::relax::VarBindingNode const*)
12: tvm::relax::ExprMutator::VisitBinding_(tvm::relax::VarBindingNode const*, tvm::relax::DataTypeImmNode const*)
11: tvm::relax::ExprMutator::VisitExpr(tvm::RelayExpr const&)
10: tvm::relax::Normalizer::Normalize(tvm::RelayExpr const&)
9: tvm::relax::ExprFunctor<tvm::RelayExpr (tvm::RelayExpr const&)>::VisitExpr(tvm::RelayExpr const&)
8: _ZZN3tvm5relax11ExprFuncto
7: tvm::relax::Normalizer::VisitExpr_(tvm::relax::CallNode const*)
6: tvm::relax::Normalizer::InferStructInfo(tvm::relax::Call const&)
5: _ZN3tvm7runtime13PackedFun
4: tvm::runtime::TypedPackedFunc<tvm::relax::StructInfo (tvm::relax::Call const&, tvm::relax::BlockBuilder const&)>::AssignTypedLambda<tvm::relax::StructInfo (*)(tvm::relax::Call const&, tvm::relax::BlockBuilder const&)>(tvm::relax::StructInfo (*)(tvm::relax::Call const&, tvm::relax::BlockBuilder const&))::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}::operator()(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*) const
3: tvm::relax::InferStructInfoMatmul(tvm::relax::Call const&, tvm::relax::BlockBuilder const&)
2: tvm::relax::InferBinaryArithOpOutDtype(tvm::relax::Call const&, tvm::relax::BlockBuilder const&, tvm::relax::TensorStructInfo const&, tvm::relax::TensorStructInfo const&)
1: _ZN3tvm5relax16BlockBuilderImpl11ReportFatalERKNS_1
0: _ZN3tvm7runtime6deta
  File "/workspace/tvm/src/relax/ir/block_builder.cc", line 138
TVMError: Data types float16 and float32 must be equal for binary operators
(mlc-prebuilt) cfruan@catalyst-fleet:/ssd1/cfruan/mlc-llm$ python3 build.py --model=WizardMath-7B-V1.0 --quantization=q4f32_1 --target=cuda
Using path "dist/models/WizardMath-7B-V1.0" for model "WizardMath-7B-V1.0"
Target configured: cuda -keys=cuda,gpu -arch=sm_89 -max_num_threads=1024 -max_shared_memory_per_block=49152 -max_threads_per_block=1024 -registers_per_block=65536 -thread_warp_size=32
Automatically using target for weight quantization: cuda -keys=cuda,gpu -arch=sm_89 -max_num_threads=1024 -max_shared_memory_per_block=49152 -max_threads_per_block=1024 -registers_per_block=65536 -thread_warp_size=32
Start computing and quantizing weights... This may take a while.
Finish computing and quantizing weights.
Total param size: 3.9250688552856445 GB
Start storing to cache dist/WizardMath-7B-V1.0-q4f32_1/params
[0327/0327] saving param_326
All finished, 132 total shards committed, record saved to dist/WizardMath-7B-V1.0-q4f32_1/params/ndarray-cache.json
Finish exporting chat config to dist/WizardMath-7B-V1.0-q4f32_1/params/mlc-chat-config.json
[04:02:05] /workspace/tvm/src/relax/ir/block_builder.cc:64: Warning: BlockBuilder destroyed with remaining blocks!
Traceback (most recent call last):
  File "/ssd1/cfruan/mlc-llm/build.py", line 4, in <module>
    main()
  File "/ssd1/cfruan/mlc-llm/mlc_llm/build.py", line 10, in main
    core.build_model_from_args(parsed_args)
  File "/ssd1/cfruan/mlc-llm/mlc_llm/core.py", line 572, in build_model_from_args
    mod = mod_transform_before_build(mod, param_manager, args, config)
  File "/ssd1/cfruan/mlc-llm/mlc_llm/core.py", line 361, in mod_transform_before_build
    mod = fuse_split_rotary_embedding(mod, config["num_attention_heads"], config["hidden_size"])
  File "/ssd1/cfruan/mlc-llm/mlc_llm/transform/fuse_split_rotary_embedding.py", line 177, in fuse_split_rotary_embedding
    mod["decode"] = rewrite_bindings(ctx, rewriter, mod["decode"])
  File "/home/cfruan/.conda/envs/mlc-prebuilt/lib/python3.10/site-packages/tvm/relax/dpl/rewrite.py", line 118, in rewrite_bindings
    return ffi.rewrite_bindings(ctx, rewriter, func)
  File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
  File "tvm/_ffi/_cython/./packed_func.pxi", line 262, in tvm._ffi._cy3.core.FuncCall
  File "tvm/_ffi/_cython/./packed_func.pxi", line 251, in tvm._ffi._cy3.core.FuncCall3
  File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
tvm._ffi.base.TVMError: Traceback (most recent call last):
30: TVMFuncCall
29: _ZN3tvm7runtime13PackedFun
28: tvm::runtime::TypedPackedFunc<tvm::relax::Function (tvm::relax::PatternContext const&, tvm::runtime::PackedFunc, tvm::relax::Function)>::AssignTypedLambda<tvm::relax::Function (*)(tvm::relax::PatternContext const&, tvm::runtime::PackedFunc, tvm::relax::Function)>(tvm::relax::Function (*)(tvm::relax::PatternContext const&, tvm::runtime::PackedFunc, tvm::relax::Function), std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >)::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}::operator()(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*) const
27: tvm::relax::RewriteBindings(tvm::relax::PatternContext const&, tvm::runtime::PackedFunc, tvm::relax::Function)
26: tvm::relax::Function tvm::relax::PatternRewriter::Run<tvm::relax::PatternContext>(tvm::relax::PatternContext, tvm::runtime::PackedFunc, tvm::relax::Function)
25: tvm::relax::ExprMutator::VisitExpr(tvm::RelayExpr const&)
24: tvm::relax::ExprFunctor<tvm::RelayExpr (tvm::RelayExpr const&)>::VisitExpr(tvm::RelayExpr const&)
23: _ZZN3tvm5relax11ExprFuncto
22: tvm::relax::ExprMutator::VisitExpr_(tvm::relax::FunctionNode const*)
21: tvm::relax::ExprMutator::VisitWithNewScope(tvm::RelayExpr const&, tvm::runtime::Optional<tvm::runtime::Array<tvm::relax::Var, void> >)
20: tvm::relax::ExprMutator::VisitExpr(tvm::RelayExpr const&)
19: tvm::relax::ExprFunctor<tvm::RelayExpr (tvm::RelayExpr const&)>::VisitExpr(tvm::RelayExpr const&)
18: _ZZN3tvm5relax11ExprFuncto
17: tvm::relax::ExprMutator::VisitExpr_(tvm::relax::SeqExprNode const*)
16: tvm::relax::ExprMutator::VisitBindingBlock(tvm::relax::BindingBlock const&)
15: tvm::relax::PatternRewriter::VisitBindingBlock_(tvm::relax::DataflowBlockNode const*)
14: tvm::relax::PatternRewriter::RewriteDataflowBlockFixedPoint(tvm::relax::BindingBlock)
13: tvm::relax::PatternRewriter::VisitBinding_(tvm::relax::VarBindingNode const*)
12: tvm::relax::ExprMutator::VisitBinding_(tvm::relax::VarBindingNode const*, tvm::relax::DataTypeImmNode const*)
11: tvm::relax::ExprMutator::VisitExpr(tvm::RelayExpr const&)
10: tvm::relax::Normalizer::Normalize(tvm::RelayExpr const&)
9: tvm::relax::ExprFunctor<tvm::RelayExpr (tvm::RelayExpr const&)>::VisitExpr(tvm::RelayExpr const&)
8: _ZZN3tvm5relax11ExprFuncto
7: tvm::relax::Normalizer::VisitExpr_(tvm::relax::CallNode const*)
6: tvm::relax::Normalizer::InferStructInfo(tvm::relax::Call const&)
5: _ZN3tvm7runtime13PackedFun
4: tvm::runtime::TypedPackedFunc<tvm::relax::StructInfo (tvm::relax::Call const&, tvm::relax::BlockBuilder const&)>::AssignTypedLambda<tvm::relax::StructInfo (*)(tvm::relax::Call const&, tvm::relax::BlockBuilder const&)>(tvm::relax::StructInfo (*)(tvm::relax::Call const&, tvm::relax::BlockBuilder const&))::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}::operator()(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*) const
3: tvm::relax::InferStructInfoMatmul(tvm::relax::Call const&, tvm::relax::BlockBuilder const&)
2: tvm::relax::InferBinaryArithOpOutDtype(tvm::relax::Call const&, tvm::relax::BlockBuilder const&, tvm::relax::TensorStructInfo const&, tvm::relax::TensorStructInfo const&)
1: _ZN3tvm5relax16BlockBuilderImpl11ReportFatalERKNS_1
0: _ZN3tvm7runtime6deta
  File "/workspace/tvm/src/relax/ir/block_builder.cc", line 138
TVMError: Data types float16 and float32 must be equal for binary operators
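
Both runs above fail at the identical spot, inside the fuse_split_rotary_embedding pass while it rewrites the "decode" function, so the crash is independent of the --target flag. The Relax normalizer is rejecting a matmul whose two operands carry different dtypes; since q4f32_1 keeps activations in float32, my guess (unconfirmed) is that the fusion pattern assumes float16 somewhere. The underlying check can be reproduced outside MLC-LLM with the public Relax BlockBuilder API. A minimal hypothetical sketch (shapes and variable names are made up, not taken from the model):

    import tvm
    from tvm import relax

    # Two operands whose dtypes deliberately disagree, standing in for
    # whatever the fused rotary pattern produces under q4f32_1
    # (assumption: the pattern mixes float16 and float32 operands).
    x = relax.Var("x", relax.TensorStructInfo((1, 4096), "float16"))
    w = relax.Var("w", relax.TensorStructInfo((4096, 4096), "float32"))

    bb = relax.BlockBuilder()
    try:
        # Normalizing the call runs InferStructInfoMatmul ->
        # InferBinaryArithOpOutDtype, the same frames as in the
        # backtraces above, which reports the dtype mismatch.
        bb.normalize(relax.op.matmul(x, w))
    except tvm.TVMError as err:
        print(err)  # "Data types float16 and float32 must be equal for binary operators"

If that reading is right, building with a float16 quantization mode such as q4f16_1 would likely sidestep the mismatch, though I have not verified that here.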