vllm.envs ¶
VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE module-attribute
¶
VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = (
False
)
VLLM_ALLOW_INSECURE_SERIALIZATION module-attribute
¶
VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False
VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING module-attribute
¶
VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING: bool = True
VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION module-attribute
¶
VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION: bool = False
VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS module-attribute
¶
VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False
VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES module-attribute
¶
VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: bool = True
VLLM_LOGITS_PROCESSOR_THREADS module-attribute
¶
VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE module-attribute
¶
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME module-attribute
¶
VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME: str = (
"VLLM_OBJECT_STORAGE_SHM_BUFFER"
)
VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16 module-attribute
¶
VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB module-attribute
¶
VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
VLLM_ROCM_QUICK_REDUCE_QUANTIZATION module-attribute
¶
VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: str = 'NONE'
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL module-attribute
¶
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS module-attribute
¶
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
VLLM_TORCH_PROFILER_RECORD_SHAPES module-attribute
¶
VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False
VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY module-attribute
¶
VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16 module-attribute
¶
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 module-attribute
¶
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS module-attribute
¶
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: bool = False
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE module-attribute
¶
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = 'auto'
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM module-attribute
¶
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
VLLM_V1_USE_PREFILL_DECODE_ATTENTION module-attribute
¶
VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False
VLLM_XLA_CACHE_PATH module-attribute
¶
VLLM_XLA_CACHE_PATH: str = join(
VLLM_CACHE_ROOT, "xla_cache"
)
environment_variables module-attribute
¶
environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_TARGET_DEVICE": lambda: lower(),
"MAX_JOBS": lambda: getenv("MAX_JOBS", None),
"NVCC_THREADS": lambda: getenv("NVCC_THREADS", None),
"VLLM_USE_PRECOMPILED": lambda: lower() in ("1", "true")
or bool(get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
"VLLM_DOCKER_BUILD_CONTEXT": lambda: lower()
in ("1", "true"),
"VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL": lambda: bool(
int(
getenv(
"VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL",
"0",
)
)
),
"CMAKE_BUILD_TYPE": lambda: getenv("CMAKE_BUILD_TYPE"),
"VERBOSE": lambda: bool(int(getenv("VERBOSE", "0"))),
"VLLM_CONFIG_ROOT": lambda: expanduser(
getenv(
"VLLM_CONFIG_ROOT",
join(get_default_config_root(), "vllm"),
)
),
"VLLM_CACHE_ROOT": lambda: expanduser(
getenv(
"VLLM_CACHE_ROOT",
join(get_default_cache_root(), "vllm"),
)
),
"VLLM_HOST_IP": lambda: getenv("VLLM_HOST_IP", ""),
"VLLM_PORT": get_vllm_port,
"VLLM_RPC_BASE_PATH": lambda: getenv(
"VLLM_RPC_BASE_PATH", gettempdir()
),
"VLLM_USE_MODELSCOPE": lambda: lower() == "true",
"VLLM_RINGBUFFER_WARNING_INTERVAL": lambda: int(
get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")
),
"CUDA_HOME": lambda: get("CUDA_HOME", None),
"VLLM_NCCL_SO_PATH": lambda: get(
"VLLM_NCCL_SO_PATH", None
),
"LD_LIBRARY_PATH": lambda: get("LD_LIBRARY_PATH", None),
"VLLM_USE_TRITON_FLASH_ATTN": lambda: lower()
in ("true", "1"),
"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": lambda: lower()
in ("true", "1"),
"VLLM_USE_AITER_UNIFIED_ATTENTION": lambda: lower()
in ("true", "1"),
"VLLM_FLASH_ATTN_VERSION": lambda: maybe_convert_int(
get("VLLM_FLASH_ATTN_VERSION", None)
),
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE": lambda: bool(
get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
!= "0"
),
"VLLM_USE_STANDALONE_COMPILE": lambda: get(
"VLLM_USE_STANDALONE_COMPILE", "1"
)
== "1",
"LOCAL_RANK": lambda: int(get("LOCAL_RANK", "0")),
"CUDA_VISIBLE_DEVICES": lambda: get(
"CUDA_VISIBLE_DEVICES", None
),
"VLLM_ENGINE_ITERATION_TIMEOUT_S": lambda: int(
get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")
),
"VLLM_API_KEY": lambda: get("VLLM_API_KEY", None),
"VLLM_DEBUG_LOG_API_SERVER_RESPONSE": lambda: lower()
== "true",
"S3_ACCESS_KEY_ID": lambda: get(
"S3_ACCESS_KEY_ID", None
),
"S3_SECRET_ACCESS_KEY": lambda: get(
"S3_SECRET_ACCESS_KEY", None
),
"S3_ENDPOINT_URL": lambda: get("S3_ENDPOINT_URL", None),
"VLLM_USAGE_STATS_SERVER": lambda: get(
"VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"
),
"VLLM_NO_USAGE_STATS": lambda: get(
"VLLM_NO_USAGE_STATS", "0"
)
== "1",
"VLLM_DO_NOT_TRACK": lambda: (
get("VLLM_DO_NOT_TRACK", None)
or get("DO_NOT_TRACK", None)
or "0"
)
== "1",
"VLLM_USAGE_SOURCE": lambda: get(
"VLLM_USAGE_SOURCE", "production"
),
"VLLM_CONFIGURE_LOGGING": lambda: int(
getenv("VLLM_CONFIGURE_LOGGING", "1")
),
"VLLM_LOGGING_CONFIG_PATH": lambda: getenv(
"VLLM_LOGGING_CONFIG_PATH"
),
"VLLM_LOGGING_LEVEL": lambda: upper(),
"VLLM_LOGGING_STREAM": lambda: getenv(
"VLLM_LOGGING_STREAM", "ext://sys.stdout"
),
"VLLM_LOGGING_PREFIX": lambda: getenv(
"VLLM_LOGGING_PREFIX", ""
),
"VLLM_LOGITS_PROCESSOR_THREADS": lambda: int(
getenv("VLLM_LOGITS_PROCESSOR_THREADS", "0")
)
if "VLLM_LOGITS_PROCESSOR_THREADS" in environ
else None,
"VLLM_LOG_STATS_INTERVAL": lambda: val
if (
val := (
float(getenv("VLLM_LOG_STATS_INTERVAL", "10."))
)
)
> 0.0
else 10.0,
"VLLM_TRACE_FUNCTION": lambda: int(
getenv("VLLM_TRACE_FUNCTION", "0")
),
"VLLM_ATTENTION_BACKEND": lambda: getenv(
"VLLM_ATTENTION_BACKEND", None
),
"VLLM_USE_FLASHINFER_SAMPLER": lambda: bool(
int(environ["VLLM_USE_FLASHINFER_SAMPLER"])
)
if "VLLM_USE_FLASHINFER_SAMPLER" in environ
else None,
"VLLM_PP_LAYER_PARTITION": lambda: getenv(
"VLLM_PP_LAYER_PARTITION", None
),
"VLLM_CPU_KVCACHE_SPACE": lambda: int(
getenv("VLLM_CPU_KVCACHE_SPACE", "0")
)
if "VLLM_CPU_KVCACHE_SPACE" in environ
else None,
"VLLM_CPU_OMP_THREADS_BIND": lambda: getenv(
"VLLM_CPU_OMP_THREADS_BIND", "auto"
),
"VLLM_CPU_NUM_OF_RESERVED_CPU": lambda: int(
getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")
)
if "VLLM_CPU_NUM_OF_RESERVED_CPU" in environ
else None,
"VLLM_CPU_MOE_PREPACK": lambda: bool(
int(getenv("VLLM_CPU_MOE_PREPACK", "1"))
),
"VLLM_CPU_SGL_KERNEL": lambda: bool(
int(getenv("VLLM_CPU_SGL_KERNEL", "0"))
),
"VLLM_USE_RAY_SPMD_WORKER": lambda: bool(
int(getenv("VLLM_USE_RAY_SPMD_WORKER", "0"))
),
"VLLM_USE_RAY_COMPILED_DAG": lambda: bool(
int(getenv("VLLM_USE_RAY_COMPILED_DAG", "0"))
),
"VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE": lambda: getenv(
"VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "auto"
),
"VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM": lambda: bool(
int(
getenv(
"VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM",
"0",
)
)
),
"VLLM_USE_RAY_WRAPPED_PP_COMM": lambda: bool(
int(getenv("VLLM_USE_RAY_WRAPPED_PP_COMM", "1"))
),
"VLLM_WORKER_MULTIPROC_METHOD": lambda: getenv(
"VLLM_WORKER_MULTIPROC_METHOD", "fork"
),
"VLLM_ASSETS_CACHE": lambda: expanduser(
getenv(
"VLLM_ASSETS_CACHE",
join(
get_default_cache_root(), "vllm", "assets"
),
)
),
"VLLM_IMAGE_FETCH_TIMEOUT": lambda: int(
getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")
),
"VLLM_VIDEO_FETCH_TIMEOUT": lambda: int(
getenv("VLLM_VIDEO_FETCH_TIMEOUT", "30")
),
"VLLM_AUDIO_FETCH_TIMEOUT": lambda: int(
getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")
),
"VLLM_MEDIA_LOADING_THREAD_COUNT": lambda: int(
getenv("VLLM_MEDIA_LOADING_THREAD_COUNT", "8")
),
"VLLM_MAX_AUDIO_CLIP_FILESIZE_MB": lambda: int(
getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")
),
"VLLM_VIDEO_LOADER_BACKEND": lambda: getenv(
"VLLM_VIDEO_LOADER_BACKEND", "opencv"
),
"VLLM_MM_INPUT_CACHE_GIB": lambda: int(
getenv("VLLM_MM_INPUT_CACHE_GIB", "4")
),
"VLLM_XLA_CACHE_PATH": lambda: expanduser(
getenv(
"VLLM_XLA_CACHE_PATH",
join(
get_default_cache_root(),
"vllm",
"xla_cache",
),
)
),
"VLLM_XLA_CHECK_RECOMPILATION": lambda: bool(
int(getenv("VLLM_XLA_CHECK_RECOMPILATION", "0"))
),
"VLLM_XLA_USE_SPMD": lambda: bool(
int(getenv("VLLM_XLA_USE_SPMD", "0"))
),
"VLLM_FUSED_MOE_CHUNK_SIZE": lambda: int(
getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")
),
"VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING": lambda: bool(
int(
getenv(
"VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING",
"1",
)
)
),
"VLLM_KEEP_ALIVE_ON_ENGINE_DEATH": lambda: bool(
getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0)
),
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": lambda: lower()
in ("1", "true"),
"VLLM_TEST_FORCE_FP8_MARLIN": lambda: lower()
in ("1", "true"),
"VLLM_TEST_FORCE_LOAD_FORMAT": lambda: getenv(
"VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"
),
"VLLM_RPC_TIMEOUT": lambda: int(
getenv("VLLM_RPC_TIMEOUT", "10000")
),
"VLLM_HTTP_TIMEOUT_KEEP_ALIVE": lambda: int(
get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")
),
"VLLM_PLUGINS": lambda: None
if "VLLM_PLUGINS" not in environ
else split(","),
"VLLM_LORA_RESOLVER_CACHE_DIR": lambda: getenv(
"VLLM_LORA_RESOLVER_CACHE_DIR", None
),
"VLLM_TORCH_PROFILER_DIR": lambda: None
if getenv("VLLM_TORCH_PROFILER_DIR", None) is None
else abspath(
expanduser(getenv("VLLM_TORCH_PROFILER_DIR", "."))
),
"VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: bool(
getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0")
!= "0"
),
"VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: bool(
getenv(
"VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", "0"
)
!= "0"
),
"VLLM_TORCH_PROFILER_WITH_STACK": lambda: bool(
getenv("VLLM_TORCH_PROFILER_WITH_STACK", "1") != "0"
),
"VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: bool(
getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0"
),
"VLLM_USE_TRITON_AWQ": lambda: bool(
int(getenv("VLLM_USE_TRITON_AWQ", "0"))
),
"VLLM_ALLOW_RUNTIME_LORA_UPDATING": lambda: lower()
in ("1", "true"),
"VLLM_SKIP_P2P_CHECK": lambda: getenv(
"VLLM_SKIP_P2P_CHECK", "1"
)
== "1",
"VLLM_DISABLED_KERNELS": lambda: []
if "VLLM_DISABLED_KERNELS" not in environ
else split(","),
"VLLM_USE_V1": lambda: bool(
int(getenv("VLLM_USE_V1", "1"))
),
"VLLM_ROCM_USE_AITER": lambda: lower() in ("true", "1"),
"VLLM_ROCM_USE_AITER_PAGED_ATTN": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_LINEAR": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_MOE": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_RMSNORM": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_MLA": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_MHA": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_FP8BMM": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_SKINNY_GEMM": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_FP8_PADDING": lambda: bool(
int(getenv("VLLM_ROCM_FP8_PADDING", "1"))
),
"VLLM_ROCM_MOE_PADDING": lambda: bool(
int(getenv("VLLM_ROCM_MOE_PADDING", "1"))
),
"VLLM_ROCM_CUSTOM_PAGED_ATTN": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_QUICK_REDUCE_QUANTIZATION": lambda: upper(),
"VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB": lambda: maybe_convert_int(
get(
"VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", None
)
),
"Q_SCALE_CONSTANT": lambda: int(
getenv("Q_SCALE_CONSTANT", "200")
),
"K_SCALE_CONSTANT": lambda: int(
getenv("K_SCALE_CONSTANT", "200")
),
"V_SCALE_CONSTANT": lambda: int(
getenv("V_SCALE_CONSTANT", "100")
),
"VLLM_ENABLE_V1_MULTIPROCESSING": lambda: bool(
int(getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))
),
"VLLM_LOG_BATCHSIZE_INTERVAL": lambda: float(
getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")
),
"VLLM_DISABLE_COMPILE_CACHE": lambda: bool(
int(getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))
),
"VLLM_SERVER_DEV_MODE": lambda: bool(
int(getenv("VLLM_SERVER_DEV_MODE", "0"))
),
"VLLM_V1_OUTPUT_PROC_CHUNK_SIZE": lambda: int(
getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")
),
"VLLM_MLA_DISABLE": lambda: bool(
int(getenv("VLLM_MLA_DISABLE", "0"))
),
"VLLM_RAY_PER_WORKER_GPUS": lambda: float(
getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")
),
"VLLM_RAY_BUNDLE_INDICES": lambda: getenv(
"VLLM_RAY_BUNDLE_INDICES", ""
),
"VLLM_CUDART_SO_PATH": lambda: getenv(
"VLLM_CUDART_SO_PATH", None
),
"VLLM_DP_RANK": lambda: int(
getenv("VLLM_DP_RANK", "0")
),
"VLLM_DP_RANK_LOCAL": lambda: int(
getenv("VLLM_DP_RANK_LOCAL", VLLM_DP_RANK)
),
"VLLM_DP_SIZE": lambda: int(
getenv("VLLM_DP_SIZE", "1")
),
"VLLM_DP_MASTER_IP": lambda: getenv(
"VLLM_DP_MASTER_IP", "127.0.0.1"
),
"VLLM_DP_MASTER_PORT": lambda: int(
getenv("VLLM_DP_MASTER_PORT", "0")
),
"VLLM_MOE_DP_CHUNK_SIZE": lambda: int(
getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")
),
"VLLM_RANDOMIZE_DP_DUMMY_INPUTS": lambda: get(
"VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0"
)
== "1",
"VLLM_CI_USE_S3": lambda: get("VLLM_CI_USE_S3", "0")
== "1",
"VLLM_MODEL_REDIRECT_PATH": lambda: get(
"VLLM_MODEL_REDIRECT_PATH", None
),
"VLLM_MARLIN_USE_ATOMIC_ADD": lambda: get(
"VLLM_MARLIN_USE_ATOMIC_ADD", "0"
)
== "1",
"VLLM_MXFP4_USE_MARLIN": lambda: maybe_convert_bool(
get("VLLM_MXFP4_USE_MARLIN", None)
),
"VLLM_V0_USE_OUTLINES_CACHE": lambda: get(
"VLLM_V0_USE_OUTLINES_CACHE", "0"
)
== "1",
"VLLM_V1_USE_OUTLINES_CACHE": lambda: get(
"VLLM_V1_USE_OUTLINES_CACHE", "0"
)
== "1",
"VLLM_TPU_BUCKET_PADDING_GAP": lambda: int(
environ["VLLM_TPU_BUCKET_PADDING_GAP"]
)
if "VLLM_TPU_BUCKET_PADDING_GAP" in environ
else 0,
"VLLM_TPU_MOST_MODEL_LEN": lambda: maybe_convert_int(
get("VLLM_TPU_MOST_MODEL_LEN", None)
),
"VLLM_TPU_USING_PATHWAYS": lambda: bool(
"proxy" in lower()
),
"VLLM_USE_DEEP_GEMM": lambda: bool(
int(getenv("VLLM_USE_DEEP_GEMM", "0"))
),
"VLLM_USE_DEEP_GEMM_E8M0": lambda: bool(
int(getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))
),
"VLLM_USE_DEEP_GEMM_E8M0_HOPPER": lambda: bool(
int(getenv("VLLM_USE_DEEP_GEMM_E8M0_HOPPER", "0"))
),
"VLLM_SKIP_DEEP_GEMM_WARMUP": lambda: bool(
int(getenv("VLLM_SKIP_DEEP_GEMM_WARMUP", "0"))
),
"VLLM_USE_FUSED_MOE_GROUPED_TOPK": lambda: bool(
int(getenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "1"))
),
"VLLM_USE_FLASHINFER_MOE_FP8": lambda: bool(
int(getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))
),
"VLLM_USE_FLASHINFER_MOE_FP4": lambda: bool(
int(getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))
),
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8": lambda: bool(
int(
getenv(
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"
)
)
),
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS": lambda: bool(
int(
getenv(
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS",
"0",
)
)
),
"VLLM_USE_FLASHINFER_MOE_MXFP4_BF16": lambda: bool(
int(
getenv(
"VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"
)
)
),
"VLLM_XGRAMMAR_CACHE_MB": lambda: int(
getenv("VLLM_XGRAMMAR_CACHE_MB", "512")
),
"VLLM_MSGPACK_ZERO_COPY_THRESHOLD": lambda: int(
getenv("VLLM_MSGPACK_ZERO_COPY_THRESHOLD", "256")
),
"VLLM_ALLOW_INSECURE_SERIALIZATION": lambda: bool(
int(
getenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "0")
)
),
"VLLM_NIXL_SIDE_CHANNEL_HOST": lambda: getenv(
"VLLM_NIXL_SIDE_CHANNEL_HOST", "localhost"
),
"VLLM_NIXL_SIDE_CHANNEL_PORT": lambda: int(
getenv("VLLM_NIXL_SIDE_CHANNEL_PORT", "5557")
),
"VLLM_ALL2ALL_BACKEND": lambda: getenv(
"VLLM_ALL2ALL_BACKEND", "naive"
),
"VLLM_FLASHINFER_MOE_BACKEND": lambda: getenv(
"VLLM_FLASHINFER_MOE_BACKEND", "throughput"
),
"VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE": lambda: int(
getenv(
"VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840"
)
),
"VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB": lambda: loads(
getenv(
"VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB",
"{}",
)
),
"VLLM_MOE_ROUTING_SIMULATION_STRATEGY": lambda: lower(),
"VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS": lambda: int(
getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")
),
"VLLM_SLEEP_WHEN_IDLE": lambda: bool(
int(getenv("VLLM_SLEEP_WHEN_IDLE", "0"))
),
"VLLM_MQ_MAX_CHUNK_BYTES_MB": lambda: int(
getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16")
),
"VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": lambda: int(
getenv("VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS", "300")
),
"VLLM_KV_CACHE_LAYOUT": lambda: getenv(
"VLLM_KV_CACHE_LAYOUT", None
),
"VLLM_COMPUTE_NANS_IN_LOGITS": lambda: bool(
int(getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))
),
"VLLM_USE_NVFP4_CT_EMULATIONS": lambda: bool(
int(getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))
),
"VLLM_NIXL_ABORT_REQUEST_TIMEOUT": lambda: int(
getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "120")
),
"VLLM_USE_CUDNN_PREFILL": lambda: bool(
int(getenv("VLLM_USE_CUDNN_PREFILL", "0"))
),
"VLLM_USE_TRTLLM_ATTENTION": lambda: getenv(
"VLLM_USE_TRTLLM_ATTENTION", None
),
"VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION": lambda: bool(
int(
getenv(
"VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION",
"0",
)
)
),
"VLLM_HAS_FLASHINFER_CUBIN": lambda: getenv(
"VLLM_HAS_FLASHINFER_CUBIN", False
),
"VLLM_USE_TRTLLM_FP4_GEMM": lambda: bool(
int(getenv("VLLM_USE_TRTLLM_FP4_GEMM", "0"))
),
"VLLM_ENABLE_CUDAGRAPH_GC": lambda: bool(
int(getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0"))
),
"VLLM_DISABLE_PAD_FOR_CUDAGRAPH": lambda: bool(
int(getenv("VLLM_DISABLE_PAD_FOR_CUDAGRAPH", "0"))
),
"VLLM_LOOPBACK_IP": lambda: getenv(
"VLLM_LOOPBACK_IP", ""
),
"VLLM_PROCESS_NAME_PREFIX": lambda: getenv(
"VLLM_PROCESS_NAME_PREFIX", "VLLM"
),
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE": lambda: bool(
int(
getenv(
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE",
"0",
)
)
),
"VLLM_ENABLE_RESPONSES_API_STORE": lambda: bool(
int(getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))
),
"VLLM_ROCM_FP8_MFMA_PAGE_ATTN": lambda: bool(
int(getenv("VLLM_ROCM_FP8_MFMA_PAGE_ATTN", "0"))
),
"VLLM_ALLREDUCE_USE_SYMM_MEM": lambda: bool(
int(getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "0"))
),
"VLLM_TUNED_CONFIG_FOLDER": lambda: getenv(
"VLLM_TUNED_CONFIG_FOLDER", None
),
"VLLM_GPT_OSS_USE_CONTAINER_TOOL": lambda: bool(
int(getenv("VLLM_GPT_OSS_USE_CONTAINER_TOOL", "0"))
),
"VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS": lambda: bool(
int(
getenv(
"VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS",
"0",
)
)
),
"VLLM_CUSTOM_SCOPES_FOR_PROFILING": lambda: bool(
int(getenv("VLLM_CUSTOM_SCOPES_FOR_PROFILING", "0"))
),
"VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES": lambda: bool(
int(
getenv(
"VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES", "1"
)
)
),
"VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME": lambda: getenv(
"VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME",
"VLLM_OBJECT_STORAGE_SHM_BUFFER",
),
}
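Each value in this mapping is a zero-argument callable, so a variable is re-read from the process environment every time the corresponding module attribute is accessed, rather than being frozen at import time. Below is a minimal sketch of the PEP 562 module-level __getattr__ hook this pattern relies on; the two-entry registry is a toy stand-in for the real table above, not the actual implementation:

import os

environment_variables = {
    # Toy stand-ins for two of the real entries above.
    "VLLM_HOST_IP": lambda: os.getenv("VLLM_HOST_IP", ""),
    "VLLM_DP_SIZE": lambda: int(os.getenv("VLLM_DP_SIZE", "1")),
}

def __getattr__(name: str):
    # PEP 562: called for module attributes not found the normal way,
    # so every access re-evaluates the lambda against the current
    # os.environ instead of returning a value cached at import time.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

With such a hook in place, import vllm.envs as envs; envs.VLLM_DP_SIZE reflects whatever the environment holds at the moment of the access.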
__dir__ ¶
compute_hash ¶
compute_hash() -> str
WARNING: Whenever a new key is added to these environment variables, ensure that it is included in the factors list if it affects the computation graph. For example, different values of VLLM_PP_LAYER_PARTITION generate different computation graphs, so it is included in the factors list. Environment variables that affect the choice of kernels or attention backends should also be included in the factors list.
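As a hedged illustration of this contract, the sketch below hashes a hand-picked subset of graph-affecting variables; the factor list here is illustrative only, and the real list in vllm/envs.py is longer and authoritative:

import hashlib

def compute_hash() -> str:
    # Illustrative subset only; the real implementation enumerates every
    # env var that can change the computation graph or kernel selection.
    factors = [
        "VLLM_PP_LAYER_PARTITION",
        "VLLM_ATTENTION_BACKEND",
        "VLLM_USE_TRITON_FLASH_ATTN",
    ]
    import vllm.envs as envs
    values = [f"{name}={getattr(envs, name)}" for name in factors]
    return hashlib.md5(";".join(values).encode()).hexdigest()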
get_default_cache_root ¶
get_default_config_root ¶
get_vllm_port ¶
Get the port from VLLM_PORT environment variable.
Returns:

| Type | Description |
| --- | --- |
| Optional[int] | The port number as an integer if VLLM_PORT is set, None otherwise. |
Raises:

| Type | Description |
| --- | --- |
| ValueError | If VLLM_PORT is set to a URI rather than a plain port number, which usually means it was injected by Kubernetes service discovery. |
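A hedged sketch of the documented behavior follows; only the integer-vs-URI contract comes from the docstring above, while the parsing details and error wording are assumptions for illustration:

import os
from typing import Optional
from urllib.parse import urlparse

def get_vllm_port() -> Optional[int]:
    # Unset variable: no port configured.
    port = os.getenv("VLLM_PORT")
    if port is None:
        return None
    try:
        return int(port)
    except ValueError as err:
        # Kubernetes can inject e.g. "tcp://10.0.0.1:5678" for a service
        # whose name collides with VLLM_PORT; surface a targeted hint
        # instead of a bare parse error.
        if urlparse(port).scheme:
            raise ValueError(
                f"VLLM_PORT={port} looks like a URI, which usually means "
                "it was set by Kubernetes service discovery. Unset it or "
                "use a plain integer port."
            ) from err
        raise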