# API Reference

ray.serve.llm.LLMConfig
https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.LLMConfig.html

pydantic model ray.serve.llm.LLMConfig[source]
The configuration for starting an LLM deployment.

PublicAPI (alpha): This API is in alpha and may change before becoming stable.

field accelerator_type: str | None = None
The type of accelerator the model runs on. Only the following values are supported: ['V100', 'P100', 'T4', 'P4', 'K80', 'A10G', 'L4', 'L40S', 'A100', 'H100', 'H200', 'H20', 'B200', 'Intel-GPU-Max-1550', 'Intel-GPU-Max-1100', 'Intel-GAUDI', 'AMD-Instinct-MI100', 'AMD-Instinct-MI250X', 'AMD-Instinct-MI250X-MI250', 'AMD-Instinct-MI210', 'AMD-Instinct-MI300A', 'AMD-Instinct-MI300X-OAM', 'AMD-Instinct-MI300X-HF', 'AMD-Instinct-MI308X', 'AMD-Instinct-MI325X-OAM', 'AMD-Instinct-MI350X-OAM', 'AMD-Instinct-MI355X-OAM', 'AMD-Radeon-R9-200-HD-7900', 'AMD-Radeon-HD-7900', 'aws-neuron-core', 'TPU-V2', 'TPU-V3', 'TPU-V4', 'TPU-V5P', 'TPU-V5LITEPOD', 'TPU-V6E', 'Ascend910B', 'Ascend910B4', 'MXC500', 'MXC550', 'A100-40G', 'A100-80G']

field callback_config: CallbackConfig [Optional]
Callback configuration to use for model initialization. Can be a string path to a class or a Callback subclass.

field deployment_config: Dict[str, Any] [Optional]
The Ray @serve.deployment options. Supported fields are: name, num_replicas, ray_actor_options, max_ongoing_requests, autoscaling_config, max_queued_requests, user_config, health_check_period_s, health_check_timeout_s, graceful_shutdown_wait_loop_s, graceful_shutdown_timeout_s, logging_config, request_router_config. For more details, see the Ray Serve documentation.

field engine_kwargs: Dict[str, Any] = {}
Additional keyword arguments for the engine. For vLLM, this includes all the configuration knobs vLLM provides out of the box, except for tensor parallelism, which is set automatically from the Ray Serve configs.

field experimental_configs: Dict[str, Any] [Optional]
Experimental configurations for Ray Serve LLM, given as a dictionary of key-value pairs. Currently supported keys are:
- stream_batching_interval_ms: Ray Serve LLM batches streaming requests together. This config decides how long to wait for a batch before processing the requests. Defaults to 50.0.
- num_ingress_replicas: The number of replicas for the router. Ray Serve uses the maximum value across all model replicas. Defaults to 2 router replicas per model replica.

field llm_engine: str = 'vLLM'
The LLMEngine that should be used to run the model. Only the following values are supported: ['vLLM']

field log_engine_metrics: bool | None = True
Enable additional engine metrics via the Ray Prometheus port.

field lora_config: Dict[str, Any] | LoraConfig | None = None
Settings for the LoRA adapter. Validated against LoraConfig.

field model_loading_config: Dict[str, Any] | ModelLoadingConfig [Required]
The settings for how to download and expose the model. Validated against ModelLoadingConfig.

field placement_group_config: Dict[str, Any] | None = None
Ray placement group configuration for scheduling vLLM engine workers. Defines resource bundles and a placement strategy for multi-node deployments. Should contain 'bundles' (a list of resource dicts) and optionally 'strategy' (defaults to 'PACK'). Example: {'bundles': [{'GPU': 1, 'CPU': 2}], 'strategy': 'PACK'}

field runtime_env: Dict[str, Any] | None = None
The runtime_env to use for the model deployment replica and the engine workers.

apply_checkpoint_info(model_id_or_path: str, trust_remote_code: bool = False) → None[source]
Apply the checkpoint info to the model config.

classmethod from_file(path: str, **kwargs) → ModelT
Load a model from a YAML file path.

get_engine_config() → None | VLLMEngineConfig[source]
Returns the engine config for the given LLM config. LLMConfig contains not only the engine config but also the deployment config, etc.

get_or_create_callback() → CallbackBase | None[source]
Get or create the callback instance for this process. This ensures one callback instance per process (singleton pattern). The instance is cached so the same object is used across all hooks.
Returns: an instance of a class that implements Callback.

multiplex_config() → ServeMultiplexConfig[source]

classmethod parse_yaml(file, **kwargs) → ModelT

setup_engine_backend()[source]

update_engine_kwargs(**kwargs: Any) → None[source]
Update the engine_kwargs and the engine_config engine_kwargs. This is typically called during engine start, when certain engine_kwargs (e.g., data_parallel_rank) become available.

validator validate_accelerator_type » accelerator_type[source]

validator validate_deployment_config » deployment_config[source]
Validates the deployment config dictionary.

validator validate_experimental_configs » experimental_configs[source]
Validates the experimental configs dictionary.

validator validate_llm_engine » llm_engine[source]
Validates the llm_engine string value.

validator validate_lora_config » lora_config[source]
Validates the lora config dictionary.

validator validate_model_loading_config » model_loading_config[source]
Validates the model loading config dictionary.

property input_modality: str
Returns the input modality of the model. There could be more types in the future. For now, if the model doesn't support vision, the modality is assumed to be text.

property max_request_context_length: int | None

property model_architecture: str

property model_id: str

property supports_vision: bool
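
As a usage illustration, here is a minimal sketch of constructing an LLMConfig and serving it, assuming the ray.serve.llm.build_openai_app helper; the model ID, model source, accelerator, and engine kwargs are illustrative placeholders, not values taken from this reference:

```python
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Illustrative values; substitute your own model source and accelerator.
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="qwen-0.5b",                       # name exposed to clients
        model_source="Qwen/Qwen2.5-0.5B-Instruct",  # HF repo or local path
    ),
    accelerator_type="A10G",
    deployment_config=dict(
        autoscaling_config=dict(min_replicas=1, max_replicas=2),
    ),
    # Extra knobs are passed through to the vLLM engine; tensor parallelism
    # is derived from the Serve config, so it is not set here.
    engine_kwargs=dict(max_model_len=8192),
)

# build_openai_app wires the config into an OpenAI-compatible Serve app.
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app)
```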

# Python API

Ray Serve API
https://docs.ray.io/en/latest/serve/api/index.html#serve-api

## Writing Applications

- serve.Deployment: Class (or function) decorated with the @serve.deployment decorator.
- serve.Application: One or more deployments bound with arguments that can be deployed together.

## Deployment Decorators

- serve.deployment: Decorator that converts a Python class to a Deployment (see the sketch after this list).
- serve.ingress: Wrap a deployment class with an ASGI application for HTTP request parsing.
- serve.batch: Converts a function to asynchronously handle batches.
- serve.multiplexed: Wrap a callable or method used to load multiplexed models in a replica.
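
A minimal sketch of the decorator APIs above, using FastAPI for the ingress; the class, route, and replica count are illustrative:

```python
from fastapi import FastAPI
from ray import serve

fastapi_app = FastAPI()

@serve.deployment(num_replicas=2)
@serve.ingress(fastapi_app)
class Greeter:
    @fastapi_app.get("/hello")
    async def hello(self, name: str = "world") -> str:
        return f"Hello, {name}!"

# Binding the decorated class with its arguments produces an Application.
app = Greeter.bind()
```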

## Deployment Handles

Note: The deprecated RayServeHandle and RayServeSyncHandle APIs have been fully removed as of Ray 2.10. See the model composition guide for how to update code to use the DeploymentHandle API instead.

- serve.handle.DeploymentHandle: A handle used to make requests to a deployment at runtime (see the sketch after this list).
- serve.handle.DeploymentResponse: A future-like object wrapping the result of a unary deployment handle call.
- serve.handle.DeploymentResponseGenerator: A future-like object wrapping the result of a streaming deployment handle call.
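
A minimal composition sketch showing a DeploymentHandle passed into another deployment; the deployment names and logic are illustrative:

```python
from ray import serve
from ray.serve.handle import DeploymentHandle

@serve.deployment
class Doubler:
    def __call__(self, x: int) -> int:
        return 2 * x

@serve.deployment
class Ingress:
    def __init__(self, doubler: DeploymentHandle):
        self._doubler = doubler

    async def __call__(self, x: int) -> int:
        # .remote() returns a DeploymentResponse; awaiting it yields the value.
        return await self._doubler.remote(x)

app = Ingress.bind(Doubler.bind())
handle = serve.run(app)           # handle to the ingress deployment
print(handle.remote(5).result())  # blocking .result() from driver code -> 10
```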

## Running Applications

- serve.start: Start Serve on the cluster.
- serve.run: Run an application and return a handle to its ingress deployment (see the sketch after this list).
- serve.delete: Delete an application by its name.
- serve.status: Get the status of Serve on the cluster.
- serve.shutdown: Completely shut down Serve on the cluster.
- serve.shutdown_async: Completely shut down Serve on the cluster asynchronously.
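
A lifecycle sketch tying these calls together; the deployment and application names are illustrative:

```python
from ray import serve

@serve.deployment
class Echo:
    def __call__(self, msg: str) -> str:
        return msg

# Deploy under an explicit application name and route prefix.
serve.run(Echo.bind(), name="echo", route_prefix="/echo")

# Inspect cluster-wide status, including this application's state.
print(serve.status().applications["echo"].status)

# Tear down just this application, then the whole Serve instance.
serve.delete("echo")
serve.shutdown()
```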

## Configurations

- serve.config.ProxyLocation: Config for where to run proxies to receive ingress traffic to the cluster.
- serve.config.gRPCOptions: gRPC options for the proxies.
- serve.config.HTTPOptions: HTTP options for the proxies.
- serve.config.AutoscalingConfig: Config for the Serve Autoscaler (see the sketch after this list).
- serve.config.AutoscalingPolicy: PublicAPI (alpha): This API is in alpha and may change before becoming stable.
- serve.config.AutoscalingContext: Rich context provided to custom autoscaling policies.
- serve.config.AggregationFunction: An enumeration.
- serve.config.RequestRouterConfig: Config for the Serve request router.
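
A sketch of where these config objects plug in, assuming cluster-level HTTP options at serve.start time and a deployment-level autoscaling config; all values are illustrative:

```python
from ray import serve
from ray.serve.config import AutoscalingConfig, HTTPOptions, ProxyLocation

# Cluster-level options are supplied when starting Serve.
serve.start(
    proxy_location=ProxyLocation.HeadOnly,
    http_options=HTTPOptions(host="0.0.0.0", port=8000),
)

# Deployment-level autoscaling takes the place of a fixed num_replicas.
@serve.deployment(
    autoscaling_config=AutoscalingConfig(min_replicas=1, max_replicas=4),
)
class Worker:
    def __call__(self, x: int) -> int:
        return x + 1

serve.run(Worker.bind())
```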

## Schemas

- serve.schema.ServeActorDetails: Detailed info about a Ray Serve actor.
- serve.schema.ProxyDetails: Detailed info about a Ray Serve ProxyActor.
- serve.schema.ApplicationStatusOverview: Describes the status of an application and all its deployments.
- serve.schema.ServeStatus: Describes the status of Serve.
- serve.schema.DeploymentStatusOverview: Describes the status of a deployment.
- serve.schema.EncodingType: Encoding type for the Serve logs.
- serve.schema.AutoscalingMetricsHealth: An enumeration.
- serve.schema.AutoscalingStatus: An enumeration.
- serve.schema.ScalingDecision: One autoscaling decision with minimal provenance.
- serve.schema.DeploymentAutoscalingDetail: Deployment-level autoscaler observability.
- serve.schema.ReplicaRank: Replica rank model.

## Request Router

- serve.request_router.ReplicaID: A unique identifier for a replica.
- serve.request_router.PendingRequest: A request that is pending execution by a replica.
- serve.request_router.RunningReplica: Contains info on a running replica.
- serve.request_router.FIFOMixin: Mixin for FIFO routing.
- serve.request_router.LocalityMixin: Mixin for locality routing.
- serve.request_router.MultiplexMixin: Mixin for multiplex routing.
- serve.request_router.RequestRouter: Abstract interface for a request router (how the router calls it).

## Advanced APIs

- serve.get_replica_context: Returns the deployment and replica tag from within a replica at runtime.
- serve.context.ReplicaContext: Stores runtime context info for replicas.
- serve.get_multiplexed_model_id: Get the multiplexed model ID for the current request (see the sketch after this list).
- serve.get_app_handle: Get a handle to the application's ingress deployment by name.
- serve.get_deployment_handle: Get a handle to a deployment by name.
- serve.grpc_util.RayServegRPCContext: Context manager to set and get gRPC context.
- serve.exceptions.BackPressureError: Raised when max_queued_requests is exceeded on a DeploymentHandle.
- serve.exceptions.RayServeException
- serve.exceptions.RequestCancelledError: Raised when a Serve request is cancelled.
- serve.exceptions.DeploymentUnavailableError: Raised when a Serve deployment is unavailable to receive requests.
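
A sketch of the multiplexing pattern that serve.multiplexed and serve.get_multiplexed_model_id support together; the loader body is a placeholder rather than a real model load:

```python
from ray import serve

@serve.deployment
class MultiModel:
    @serve.multiplexed(max_num_models_per_replica=2)
    async def load_model(self, model_id: str):
        # Placeholder loader; a real implementation would fetch and
        # initialize the weights for `model_id` here.
        return {"model_id": model_id}

    async def __call__(self, request) -> dict:
        # The model ID comes from the "serve_multiplexed_model_id" request
        # header; Serve routes the request to a replica that has it loaded.
        model_id = serve.get_multiplexed_model_id()
        return await self.load_model(model_id)

app = MultiModel.bind()
```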