Source code for mindspore_serving.server._server

# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Interface for start up servable"""
import os
import time
import threading
import signal

import mindspore_serving.log as logger
from mindspore_serving.server.worker import init_mindspore
from mindspore_serving.server.master import start_master_server, stop_on_except, stop, at_stop_list, only_model_stage
from mindspore_serving.server._servable_common import WorkerContext
from mindspore_serving.server._servable_local import ServableStartConfig, ServableContextData, merge_config
from mindspore_serving.server._servable_local import ServableExtraContextData
from mindspore_serving.server.distributed._servable_distributed import DistributedStartConfig, DistributedContextData
from mindspore_serving._mindspore_serving import ExitSignalHandle_


[docs]@stop_on_except def start_servables(servable_configs): r""" Start up servables. It can be used to start multiple different servables. One servable can be deployed on multiple chips, and each chip runs a servable copy. On Ascend 910 hardware platform, each copy of each servable owns one chip. Different servables or different versions of the same servable need to be deployed on different chips. On Ascend 310 and GPU hardware platform, one chip can be shared by multi servables, and different servables or different versions of the same servable can be deployed on the same chip to realize chip reuse. Args: servable_configs (Union[ServableStartConfig, list[ServableStartConfig], tuple[ServableStartConfig]]): The startup configs of one or more servables. Raises: RuntimeError: Failed to start one or more servables. For log of one servable, please refer to subdirectory serving_logs. Examples: >>> import os >>> from mindspore_serving import server >>> >>> servable_dir = os.path.abspath(".") >>> resnet_config = server.ServableStartConfig(servable_dir, "resnet", device_ids=(0,1)) >>> add_config = server.ServableStartConfig(servable_dir, "add", device_ids=(2,3)) >>> server.start_servables(servable_configs=(resnet_config, add_config)) # press Ctrl+C to stop >>> server.start_grpc_server("0.0.0.0:5500") """ if isinstance(servable_configs, (ServableStartConfig, DistributedStartConfig)): servable_configs = (servable_configs,) if not isinstance(servable_configs, (tuple, list)): raise RuntimeError(f"Parameter '{servable_configs}' should be ServableStartConfig, list or tuple of " f"ServableStartConfig, but actually {type(servable_configs)}") for config in servable_configs: if not isinstance(config, (ServableStartConfig, DistributedStartConfig)): raise RuntimeError( f"The item of parameter '{servable_configs}' should be ServableStartConfig, but actually " f"{type(config)}") init_mindspore.set_mindspore_cxx_env() # merge ServableStartConfig with same servable name and running version number try: servable_configs = merge_config(servable_configs) except RuntimeError as e: logger.error(f"Start servables failed: {str(e)}") raise logger.info("Servable configs:") for config in servable_configs: if isinstance(config, ServableStartConfig): logger.info( f"servable directory: {config.servable_directory}, servable name: {config.servable_name}, " f"running version number: {config.version_number}, device ids:{config.device_ids}, " f"device type: {config.device_type}") if isinstance(config, DistributedStartConfig): logger.info(f"distributed servable, servable directory: {config.servable_directory}, " f"servable name: {config.servable_name}, rank table json file: {config.rank_table_json_file}, " f"running version number: {config.version_number}, " f"distributed address:{config.distributed_address}, " f"wait agents time: {config.wait_agents_time_in_seconds}s") master_pid = os.getpid() unix_socket_dir = "unix_socket_files" try: os.mkdir(unix_socket_dir) except FileExistsError: pass master_address = f"unix:{unix_socket_dir}/serving_master_{master_pid}" start_master_server(address=master_address) signal.signal(signal.SIGCHLD, signal.SIG_IGN) worker_list = _start_workers_with_devices(master_address, servable_configs) _listening_workers_when_startup(worker_list) extra_worker_list = _start_extra_workers(master_address, servable_configs) worker_list.extend(extra_worker_list) _listening_workers_after_startup(worker_list)
def _start_workers_with_devices(master_address, servable_configs): """Start workers that occupy devices""" worker_list = [] for config in servable_configs: if isinstance(config, ServableStartConfig): for device_id in config.device_ids: try: context_data = ServableContextData(config, device_id, master_address) sub_process = context_data.new_worker_process() worker_context = WorkerContext(context_data, master_address, sub_process) except RuntimeError as e: _send_exit_signal_to_children(worker_list) raise RuntimeError(f"Start worker failed: {e}") worker_list.append(worker_context) elif isinstance(config, DistributedStartConfig): try: context_data = DistributedContextData(config, master_address) sub_process = context_data.new_worker_process() worker_context = WorkerContext(context_data, master_address, sub_process) except RuntimeError as e: _send_exit_signal_to_children(worker_list) raise RuntimeError(f"Start worker failed: {e}") worker_list.append(worker_context) return worker_list def _start_extra_workers(master_address, servable_configs): """Start workers that do not occupy devices""" worker_list = [] for config in servable_configs: if not isinstance(config, ServableStartConfig): continue if len(config.device_ids) >= config.num_parallel_workers: continue if only_model_stage(config.servable_name): logger.warning(f"There is no need to startup additional worker processes, all stages are models, servable:" f" {config.servable_name}") continue extra_worker_count = config.num_parallel_workers - len(config.device_ids) for index in range(extra_worker_count): try: context_data = ServableExtraContextData(config, master_address, index) sub_process = context_data.new_worker_process() worker_context = WorkerContext(context_data, master_address, sub_process) except RuntimeError as e: _send_exit_signal_to_children(worker_list) raise RuntimeError(f"Start worker failed: {e}") worker_list.append(worker_context) _listening_workers_when_startup(worker_list) return worker_list def _send_exit_signal_to_children(worker_list): """Send exit signal to all child processes, and terminate all child processes when they are still alive in some seconds later""" if not worker_list: return for worker in worker_list: worker.send_exit_signal(signal.SIGINT) wait_seconds = 10 for i in range(wait_seconds * 100): # 10s all_exit = True for worker in worker_list: if worker.is_alive(): if i % 100 == 0: logger.warning(f"Wait for all worker processes to exit, otherwise they will be forcibly killed in " f"{wait_seconds - (i // 100)} seconds.") all_exit = False break if all_exit: logger.info(f"All Child process exited") return time.sleep(0.01) for worker in worker_list: worker.send_exit_signal(signal.SIGKILL) def _listening_workers_when_startup(worker_list): """Listening child process""" if not worker_list: return time_last = time.time() while True: time.sleep(0.1) if ExitSignalHandle_.has_stopped(): logger.warning("Fail to start workers because of signal SIGINT or SIGTERM") _send_exit_signal_to_children(worker_list) raise RuntimeError("Fail to start workers because of signal SIGINT or SIGTERM") all_ready = True for worker in worker_list: if not worker.is_alive() or worker.has_error_notified(): for _ in range(100): if worker.has_error_notified(): logger.warning(f"Fail to start workers: {worker.get_notified_error()}") _send_exit_signal_to_children(worker_list) raise RuntimeError(f"Fail to start workers: {worker.get_notified_error()}") time.sleep(0.01) # wait 1s for error msg logger.error(f"Fail to start workers because of death of one worker") _send_exit_signal_to_children(worker_list) raise RuntimeError("Fail to start workers because of death of one worker") if not worker.ready(): if time.time() - time_last > 1: time_last = time.time() worker.print_status() all_ready = False if all_ready: break logger.info("All workers is ready") def _listening_workers_after_startup(worker_list): """Listening agent status after success start up of agents""" def listening_thread_fun(): while True: time.sleep(0.01) if ExitSignalHandle_.has_stopped(): logger.warning("Serving server begin to exit: receive exit signal") break alive_count = 0 for worker in worker_list: occupy_device_worker = 1 if worker.own_device() else 0 if worker.is_in_process_switching: alive_count += occupy_device_worker continue if worker.is_alive(): alive_count += occupy_device_worker if worker.is_unavailable(): worker.restart_worker() continue # not alive # has exit or error notified, if worker.has_exit_notified() or worker.has_error_notified(): continue if worker.exit_for_enough_time(): # has exit for 1s and there were no normal handled requests if not worker.can_be_restart(): continue logger.warning( f"detect worker process has exited, try to restart, servable: {worker.to_string()}") worker.restart_worker() alive_count += occupy_device_worker if not alive_count: logger.warning("Serving server begin to exit: all worker processes that occupy devices have exited") break _send_exit_signal_to_children(worker_list) stop() thread = threading.Thread(target=listening_thread_fun) thread.start() def join_thread(): if thread != threading.current_thread(): thread.join() return True return False at_stop_list.append(join_thread)