Add/update the quantized ONNX model files and README.md for Transformers.js v3
#1 opened by whitphx (HF Staff)
Applied Quantizations
❌ Based on decoder_model.onnx
with slimming
0%| | 0/1 [00:00<?, ?it/s]
Processing /tmp/tmp73cu4bel/decoder_model.onnx: 0%| | 0/1 [00:00<?, ?it/s]
0%| | 0/7 [00:00<?, ?it/s]
- Quantizing to fp16: 0%| | 0/7 [00:00<?, ?it/s]
/home/ubuntu/src/tjsmigration/transformers.js/scripts/float16.py:85: UserWarning: the float32 number -3.4028234663852886e+38 will be truncated to -10000.0
warnings.warn(
- Quantizing to fp16: 0%| | 0/7 [00:00<?, ?it/s]
Processing /tmp/tmp73cu4bel/decoder_model.onnx: 0%| | 0/1 [00:00<?, ?it/s]
Traceback (most recent call last):
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/quantize.py", line 377, in <module>
main()
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/quantize.py", line 374, in main
quantize(input_folder, output_folder, quantization_args)
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/quantize.py", line 309, in quantize
quantize_fp16(
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/quantize.py", line 217, in quantize_fp16
model_fp16 = float16.convert_float_to_float16(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/float16.py", line 273, in convert_float_to_float16
process_graph_output(curr_graph, is_top_level, keep_io_types)
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/float16.py", line 372, in process_graph_output
assert len(upstream_nodes) == 1 # Should be only one node
^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError
❌ Based on decoder_model.onnx
without slimming
0%| | 0/1 [00:00<?, ?it/s]
Processing /tmp/tmp7pqh64tg/decoder_model.onnx: 0%| | 0/1 [00:00<?, ?it/s]
0%| | 0/7 [00:00<?, ?it/s]
- Quantizing to fp16: 0%| | 0/7 [00:00<?, ?it/s]
/home/ubuntu/src/tjsmigration/transformers.js/scripts/float16.py:85: UserWarning: the float32 number -3.4028234663852886e+38 will be truncated to -10000.0
warnings.warn(
- Quantizing to fp16: 0%| | 0/7 [00:00<?, ?it/s]
Processing /tmp/tmp7pqh64tg/decoder_model.onnx: 0%| | 0/1 [00:00<?, ?it/s]
Traceback (most recent call last):
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/quantize.py", line 377, in <module>
main()
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/quantize.py", line 374, in main
quantize(input_folder, output_folder, quantization_args)
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/quantize.py", line 309, in quantize
quantize_fp16(
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/quantize.py", line 217, in quantize_fp16
model_fp16 = float16.convert_float_to_float16(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/float16.py", line 273, in convert_float_to_float16
process_graph_output(curr_graph, is_top_level, keep_io_types)
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/float16.py", line 372, in process_graph_output
assert len(upstream_nodes) == 1 # Should be only one node
^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError
❌ Based on encoder_model.onnx
with slimming
None
↳ ✅ fp16
: encoder_model_fp16.onnx
(added)
↳ ✅ q8
: encoder_model_quantized.onnx
(added)
↳ ❌ int8
: encoder_model_int8.onnx
(added but JS-based E2E test failed)
dtype not specified for "decoder_model_merged". Using the default dtype (fp32) for this device (cpu).
/home/ubuntu/src/tjsmigration/node_modules/.pnpm/onnxruntime-node@1.21.0/node_modules/onnxruntime-node/dist/backend.js:25
__classPrivateFieldGet(this, _OnnxruntimeSessionHandler_inferenceSession, "f").loadModel(pathOrBuffer, options);
^
Error: Could not find an implementation for ConvInteger(10) node with name '/conv1/Conv_quant'
at new OnnxruntimeSessionHandler (/home/ubuntu/src/tjsmigration/node_modules/.pnpm/onnxruntime-node@1.21.0/node_modules/onnxruntime-node/dist/backend.js:25:92)
at Immediate.<anonymous> (/home/ubuntu/src/tjsmigration/node_modules/.pnpm/onnxruntime-node@1.21.0/node_modules/onnxruntime-node/dist/backend.js:67:29)
at process.processImmediate (node:internal/timers:485:21)
Node.js v22.16.0
↳ ✅ uint8
: encoder_model_uint8.onnx
(added)
↳ ✅ q4
: encoder_model_q4.onnx
(added)
↳ ✅ q4f16
: encoder_model_q4f16.onnx
(added)
↳ ✅ bnb4
: encoder_model_bnb4.onnx
(added)
❌ Based on encoder_model.onnx
with slimming
None
↳ ✅ fp16
: encoder_model_fp16.onnx
(added)
↳ ✅ q8
: encoder_model_quantized.onnx
(added)
↳ ❌ int8
: encoder_model_int8.onnx
(added but JS-based E2E test failed)
dtype not specified for "decoder_model_merged". Using the default dtype (fp32) for this device (cpu).
/home/ubuntu/src/tjsmigration/node_modules/.pnpm/onnxruntime-node@1.21.0/node_modules/onnxruntime-node/dist/backend.js:25
__classPrivateFieldGet(this, _OnnxruntimeSessionHandler_inferenceSession, "f").loadModel(pathOrBuffer, options);
^
Error: Could not find an implementation for ConvInteger(10) node with name '/conv1/Conv_quant'
at new OnnxruntimeSessionHandler (/home/ubuntu/src/tjsmigration/node_modules/.pnpm/onnxruntime-node@1.21.0/node_modules/onnxruntime-node/dist/backend.js:25:92)
at Immediate.<anonymous> (/home/ubuntu/src/tjsmigration/node_modules/.pnpm/onnxruntime-node@1.21.0/node_modules/onnxruntime-node/dist/backend.js:67:29)
at process.processImmediate (node:internal/timers:485:21)
Node.js v22.16.0
↳ ✅ uint8
: encoder_model_uint8.onnx
(added)
↳ ✅ q4
: encoder_model_q4.onnx
(added)
↳ ✅ q4f16
: encoder_model_q4f16.onnx
(added)
↳ ✅ bnb4
: encoder_model_bnb4.onnx
(added)
❌ Based on decoder_with_past_model.onnx
with slimming
0%| | 0/1 [00:00<?, ?it/s]
Processing /tmp/tmpmaahe0ic/decoder_with_past_model.onnx: 0%| | 0/1 [00:00<?, ?it/s]
0%| | 0/7 [00:00<?, ?it/s]
- Quantizing to fp16: 0%| | 0/7 [00:00<?, ?it/s]
/home/ubuntu/src/tjsmigration/transformers.js/scripts/float16.py:85: UserWarning: the float32 number -3.4028234663852886e+38 will be truncated to -10000.0
warnings.warn(
- Quantizing to fp16: 0%| | 0/7 [00:00<?, ?it/s]
Processing /tmp/tmpmaahe0ic/decoder_with_past_model.onnx: 0%| | 0/1 [00:00<?, ?it/s]
Traceback (most recent call last):
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/quantize.py", line 377, in <module>
main()
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/quantize.py", line 374, in main
quantize(input_folder, output_folder, quantization_args)
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/quantize.py", line 309, in quantize
quantize_fp16(
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/quantize.py", line 217, in quantize_fp16
model_fp16 = float16.convert_float_to_float16(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/float16.py", line 273, in convert_float_to_float16
process_graph_output(curr_graph, is_top_level, keep_io_types)
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/float16.py", line 372, in process_graph_output
assert len(upstream_nodes) == 1 # Should be only one node
^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError
❌ Based on decoder_with_past_model.onnx
without slimming
0%| | 0/1 [00:00<?, ?it/s]
Processing /tmp/tmpsrowz1_3/decoder_with_past_model.onnx: 0%| | 0/1 [00:00<?, ?it/s]
0%| | 0/7 [00:00<?, ?it/s]
- Quantizing to fp16: 0%| | 0/7 [00:00<?, ?it/s]
/home/ubuntu/src/tjsmigration/transformers.js/scripts/float16.py:85: UserWarning: the float32 number -3.4028234663852886e+38 will be truncated to -10000.0
warnings.warn(
- Quantizing to fp16: 0%| | 0/7 [00:00<?, ?it/s]
Processing /tmp/tmpsrowz1_3/decoder_with_past_model.onnx: 0%| | 0/1 [00:00<?, ?it/s]
Traceback (most recent call last):
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/quantize.py", line 377, in <module>
main()
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/quantize.py", line 374, in main
quantize(input_folder, output_folder, quantization_args)
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/quantize.py", line 309, in quantize
quantize_fp16(
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/quantize.py", line 217, in quantize_fp16
model_fp16 = float16.convert_float_to_float16(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/float16.py", line 273, in convert_float_to_float16
process_graph_output(curr_graph, is_top_level, keep_io_types)
File "/home/ubuntu/src/tjsmigration/transformers.js/scripts/float16.py", line 372, in process_graph_output
assert len(upstream_nodes) == 1 # Should be only one node
^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError