def select_int8_moe_backend(
    config: FusedMoEConfig,
    weight_key: QuantKey | None = kInt8StaticChannelSym,
    activation_key: QuantKey | None = kInt8DynamicTokenSym,
) -> tuple[Int8MoeBackend, type[mk.FusedMoEExperts]]:
    """
    Select the primary Int8 MoE backend.
    Note: Shape-specific fallbacks may still occur at runtime.

    Selection order:
      1. If ``config.is_lora_enabled`` is set, the Triton backend is returned
         together with the first kernel class registered for it.
      2. If ``config.moe_backend`` is anything other than ``"auto"``, it is
         mapped through ``map_int8_backend`` and that backend must support
         the configuration.
      3. Otherwise the backends from ``_get_priority_backends(config)`` are
         probed in order and the first supported kernel class wins.

    Args:
        config: MoE configuration describing the deployment.
        weight_key: Quantization key forwarded to each kernel class's
            ``is_supported_config`` check.
        activation_key: Quantization key forwarded to each kernel class's
            ``is_supported_config`` check.

    Returns:
        A ``(backend, kernel_cls)`` tuple for the selected backend.

    Raises:
        ValueError: An explicitly requested backend has no kernel class that
            supports the configuration.
        NotImplementedError: In ``"auto"`` mode, no candidate backend
            supports the configuration.
    """
    # LoRA short-circuits selection: Triton is returned without a support
    # check and before config.moe_backend is consulted.
    if config.is_lora_enabled:
        return Int8MoeBackend.TRITON, backend_to_kernel_cls(Int8MoeBackend.TRITON)[0]

    AVAILABLE_BACKENDS = _get_priority_backends(config)

    activation_format = (
        mk.FusedMoEActivationFormat.BatchedExperts
        if config.moe_parallel_config.use_batched_activation_format
        else mk.FusedMoEActivationFormat.Standard
    )

    def _make_log_backend(backend: Int8MoeBackend) -> str:
        """Build the message announcing the chosen backend."""
        available_backend_strs = [b.value for b in AVAILABLE_BACKENDS]
        return (
            f"Using {backend.value} Int8 MoE backend out "
            f"of potential backends: {available_backend_strs}."
        )

    def _make_log_unsupported(backend: Int8MoeBackend, reason: str | None) -> str:
        """Build the message for a rejected backend, with the reason if known."""
        if reason:
            return (
                f"Int8 MoE backend {backend.value} does not support the "
                f"deployment configuration since {reason}."
            )
        else:
            return (
                f"Int8 MoE backend '{backend.value}' does not support the "
                "deployment configuration."
            )

    def _return_or_raise(
        backend: Int8MoeBackend,
    ) -> tuple[Int8MoeBackend, type[mk.FusedMoEExperts]]:
        """Return the first supported kernel class of ``backend`` or raise."""
        # Pre-bind so the ValueError below is still raised (with the generic
        # message) when the backend maps to no kernel classes at all; without
        # this the raise would fail with UnboundLocalError instead.
        reason: str | None = None
        for k_cls in backend_to_kernel_cls(backend):
            supported, reason = k_cls.is_supported_config(
                k_cls, config, weight_key, activation_key, activation_format
            )
            if supported:
                logger.info_once(_make_log_backend(backend))
                return backend, k_cls
        # Only the reason reported by the last probed kernel class is shown.
        raise ValueError(_make_log_unsupported(backend, reason))

    # Handle explicit moe_backend from user.
    runner_backend = config.moe_backend
    if runner_backend != "auto":
        requested_backend = map_int8_backend(runner_backend)
        return _return_or_raise(requested_backend)

    # Select kernels in order of backend.
    for backend in AVAILABLE_BACKENDS:
        for k_cls in backend_to_kernel_cls(backend):
            supported, reason = k_cls.is_supported_config(
                k_cls,
                config,
                weight_key,
                activation_key,
                activation_format,
            )
            if supported:
                logger.info_once(_make_log_backend(backend))
                return backend, k_cls
            else:
                # Unlike the explicit-backend path, a rejection here is not
                # fatal: record it at debug level and keep probing.
                logger.debug_once(_make_log_unsupported(backend, reason))

    raise NotImplementedError(
        "No Int8 MoE backend supports the deployment configuration."
    )