# Source code for mindspore_serving.server._server

# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Interface for start up servable"""
import os
import time
import threading
import signal

import mindspore_serving.log as logger
from mindspore_serving.server.worker import init_mindspore
from mindspore_serving.server.master import start_master_server, stop_on_except, stop, at_stop_list, only_model_stage
from mindspore_serving.server._servable_common import WorkerContext
from mindspore_serving.server._servable_local import ServableStartConfig, ServableContextData, merge_config
from mindspore_serving.server._servable_local import ServableExtraContextData
from mindspore_serving.server.distributed._servable_distributed import DistributedStartConfig, DistributedContextData
from mindspore_serving._mindspore_serving import ExitSignalHandle_


@stop_on_except
def start_servables(servable_configs):
    r"""
    Start up servables.

    It can be used to start multiple different servables. One servable can be deployed on multiple chips,
    and each chip runs a servable copy.

    On Ascend 910 hardware platform, each copy of each servable owns one chip. Different servables or different versions
    of the same servable need to be deployed on different chips.
    On Ascend 310 and GPU hardware platform, one chip can be shared by multi servables, and different servables or
    different versions of the same servable can be deployed on the same chip to realize chip reuse.

    Args:
        servable_configs (Union[ServableStartConfig, list[ServableStartConfig], tuple[ServableStartConfig]]): The
            startup configs of one or more servables. DistributedStartConfig objects are accepted as well, either
            alone or as items of the list/tuple.

    Raises:
        RuntimeError: The type of `servable_configs` or of one of its items is invalid, or failed to start one or
            more servables. For log of one servable, please refer to subdirectory serving_logs.

    Examples:
        >>> import os
        >>> from mindspore_serving import server
        >>>
        >>> servable_dir = os.path.abspath(".")
        >>> resnet_config = server.ServableStartConfig(servable_dir, "resnet", device_ids=(0,1))
        >>> add_config = server.ServableStartConfig(servable_dir, "add", device_ids=(2,3))
        >>> server.start_servables(servable_configs=(resnet_config, add_config))  # press Ctrl+C to stop
        >>> server.start_grpc_server("0.0.0.0:5500")
    """
    config_types = (ServableStartConfig, DistributedStartConfig)
    # A single config is normalized to a one-element tuple so the rest of the function handles one shape only.
    if isinstance(servable_configs, config_types):
        servable_configs = (servable_configs,)
    if not isinstance(servable_configs, (tuple, list)):
        # Report the parameter name, not the (possibly huge or meaningless) value that was passed in.
        raise RuntimeError(f"Parameter 'servable_configs' should be ServableStartConfig, list or tuple of "
                           f"ServableStartConfig, but actually {type(servable_configs)}")
    for config in servable_configs:
        if not isinstance(config, config_types):
            raise RuntimeError(
                f"The item of parameter 'servable_configs' should be ServableStartConfig, but actually "
                f"{type(config)}")

    init_mindspore.set_mindspore_cxx_env()
    # merge ServableStartConfig with same servable name and running version number
    try:
        servable_configs = merge_config(servable_configs)
    except RuntimeError as e:
        logger.error(f"Start servables failed: {str(e)}")
        raise
    logger.info("Servable configs:")
    for config in servable_configs:
        if isinstance(config, ServableStartConfig):
            logger.info(
                f"servable directory: {config.servable_directory}, servable name: {config.servable_name}, "
                f"running version number: {config.version_number}, device ids:{config.device_ids}, "
                f"device type: {config.device_type}")
        elif isinstance(config, DistributedStartConfig):
            logger.info(f"distributed servable, servable directory: {config.servable_directory}, "
                        f"servable name: {config.servable_name}, rank table json file: {config.rank_table_json_file}, "
                        f"running version number: {config.version_number}, "
                        f"distributed address:{config.distributed_address}, "
                        f"wait agents time: {config.wait_agents_time_in_seconds}s")

    # The master listens on a unix socket whose name contains this process id; the socket directory is created
    # relative to the current working directory and may already exist from a previous run.
    master_pid = os.getpid()
    unix_socket_dir = "unix_socket_files"
    try:
        os.mkdir(unix_socket_dir)
    except FileExistsError:
        pass
    master_address = f"unix:{unix_socket_dir}/serving_master_{master_pid}"
    start_master_server(address=master_address)

    # Ignore SIGCHLD before any worker process is created.
    signal.signal(signal.SIGCHLD, signal.SIG_IGN)

    worker_list = _start_workers_with_devices(master_address, servable_configs)
    _listening_workers_when_startup(worker_list)
    try:
        extra_worker_list = _start_extra_workers(master_address, servable_configs)
    except RuntimeError:
        # _start_extra_workers only signals the extra workers it created itself, so the device workers that are
        # already running must be asked to exit here before the error propagates.
        _send_exit_signal_to_children(worker_list)
        raise
    worker_list.extend(extra_worker_list)
    _listening_workers_after_startup(worker_list)


def _start_workers_with_devices(master_address, servable_configs):
    """Start workers that occupy devices"""
    worker_list = []
    for config in servable_configs:
        if isinstance(config, ServableStartConfig):
            for device_id in config.device_ids:
                try:
                    context_data = ServableContextData(config, device_id, master_address)
                    sub_process = context_data.new_worker_process()
                    worker_context = WorkerContext(context_data, master_address, sub_process)
                except RuntimeError as e:
                    _send_exit_signal_to_children(worker_list)
                    raise RuntimeError(f"Start worker failed: {e}")
                worker_list.append(worker_context)
        elif isinstance(config, DistributedStartConfig):
            try:
                context_data = DistributedContextData(config, master_address)
                sub_process = context_data.new_worker_process()
                worker_context = WorkerContext(context_data, master_address, sub_process)
            except RuntimeError as e:
                _send_exit_signal_to_children(worker_list)
                raise RuntimeError(f"Start worker failed: {e}")
            worker_list.append(worker_context)
    return worker_list


def _start_extra_workers(master_address, servable_configs):
    """Start workers that do not occupy devices.

    For every ServableStartConfig whose `num_parallel_workers` exceeds the number of its device ids, the missing
    number of extra workers is started, unless `only_model_stage` reports that the servable consists of model
    stages only. The function then waits until all extra workers are ready.

    Args:
        master_address (str): Address of the master server, handed to every worker context.
        servable_configs (Iterable): Startup configs; items that are not ServableStartConfig are ignored.

    Returns:
        list, the WorkerContext objects of the extra workers that were started.

    Raises:
        RuntimeError: Creating one extra worker failed or the extra workers did not become ready. Only the extra
            workers started by this call are sent exit signals; the original error is attached as the cause.
    """
    worker_list = []
    for config in servable_configs:
        if not isinstance(config, ServableStartConfig):
            continue
        if len(config.device_ids) >= config.num_parallel_workers:
            continue
        if only_model_stage(config.servable_name):
            logger.warning(f"There is no need to startup additional worker processes, all stages are models, servable:"
                           f" {config.servable_name}")
            continue
        extra_worker_count = config.num_parallel_workers - len(config.device_ids)
        for index in range(extra_worker_count):
            try:
                context_data = ServableExtraContextData(config, master_address, index)
                sub_process = context_data.new_worker_process()
                worker_context = WorkerContext(context_data, master_address, sub_process)
            except RuntimeError as e:
                _send_exit_signal_to_children(worker_list)
                # Chain explicitly so the traceback shows the root cause as the direct cause of this error.
                raise RuntimeError(f"Start worker failed: {e}") from e
            worker_list.append(worker_context)
    _listening_workers_when_startup(worker_list)
    return worker_list


def _send_exit_signal_to_children(worker_list):
    """Send exit signal to all child processes, and terminate all child processes when they are still alive
    in some seconds later"""
    if not worker_list:
        return
    for worker in worker_list:
        worker.send_exit_signal(signal.SIGINT)
    wait_seconds = 10
    for i in range(wait_seconds * 100):  # 10s
        all_exit = True
        for worker in worker_list:
            if worker.is_alive():
                if i % 100 == 0:
                    logger.warning(f"Wait for all worker processes to exit, otherwise they will be forcibly killed in "
                                   f"{wait_seconds - (i // 100)} seconds.")
                all_exit = False
                break
        if all_exit:
            logger.info(f"All Child process exited")
            return
        time.sleep(0.01)

    for worker in worker_list:
        worker.send_exit_signal(signal.SIGKILL)


def _listening_workers_when_startup(worker_list):
    """Listening child process"""
    if not worker_list:
        return
    time_last = time.time()
    while True:
        time.sleep(0.1)
        if ExitSignalHandle_.has_stopped():
            logger.warning("Fail to start workers because of signal SIGINT or SIGTERM")
            _send_exit_signal_to_children(worker_list)
            raise RuntimeError("Fail to start workers because of signal SIGINT or SIGTERM")

        all_ready = True
        for worker in worker_list:
            if not worker.is_alive() or worker.has_error_notified():
                for _ in range(100):
                    if worker.has_error_notified():
                        logger.warning(f"Fail to start workers: {worker.get_notified_error()}")
                        _send_exit_signal_to_children(worker_list)
                        raise RuntimeError(f"Fail to start workers: {worker.get_notified_error()}")
                    time.sleep(0.01)  # wait 1s for error msg
                logger.error(f"Fail to start workers because of death of one worker")
                _send_exit_signal_to_children(worker_list)
                raise RuntimeError("Fail to start workers because of death of one worker")
            if not worker.ready():
                if time.time() - time_last > 1:
                    time_last = time.time()
                    worker.print_status()
                all_ready = False
        if all_ready:
            break
    logger.info("All workers is ready")


def _listening_workers_after_startup(worker_list):
    """Monitor the worker processes from a background thread after they have started successfully.

    The listening thread polls every worker in `worker_list` every 0.01s. It calls `restart_worker()` on workers
    that are alive but unavailable, and on workers that exited without an exit/error notification once they have
    been gone long enough and may be restarted. The loop ends when an exit signal has been received or when no
    worker that occupies a device counts as alive any more; the thread then sends exit signals to all workers and
    calls `stop()`.

    A callback that joins the listening thread is appended to `at_stop_list`.

    Args:
        worker_list (list): Worker contexts to watch. The thread keeps a reference to this list.
    """

    def listening_thread_fun():
        while True:
            time.sleep(0.01)
            if ExitSignalHandle_.has_stopped():
                logger.warning("Serving server begin to exit: receive exit signal")
                break
            # Number of device-occupying workers that still count as alive in this polling round.
            alive_count = 0
            for worker in worker_list:
                # Workers that do not own a device contribute 0, so they never keep the server alive on their own.
                occupy_device_worker = 1 if worker.own_device() else 0
                # A worker in the middle of a process switch is counted as alive and otherwise left alone.
                if worker.is_in_process_switching:
                    alive_count += occupy_device_worker
                    continue
                if worker.is_alive():
                    alive_count += occupy_device_worker
                    if worker.is_unavailable():
                        worker.restart_worker()
                    continue
                # not alive
                # The worker notified its exit or an error: it is not counted as alive and not restarted.
                if worker.has_exit_notified() or worker.has_error_notified():
                    continue
                if worker.exit_for_enough_time():
                    # has exit for 1s and there were no normal handled requests
                    if not worker.can_be_restart():
                        continue
                    logger.warning(
                        f"detect worker process has exited, try to restart, servable: {worker.to_string()}")
                    worker.restart_worker()
                # Reached both after a restart and while the worker has not been gone long enough yet: in either
                # case the worker still counts as alive for this round.
                alive_count += occupy_device_worker

            if not alive_count:
                logger.warning("Serving server begin to exit: all worker processes that occupy devices have exited")
                break

        # The loop has ended: shut down the remaining workers, then stop the server.
        _send_exit_signal_to_children(worker_list)
        stop()

    thread = threading.Thread(target=listening_thread_fun)
    thread.start()

    def join_thread():
        # Returns True when the listening thread was joined, False when called from the listening thread itself.
        # The guard matters because the listening thread calls stop() on its own; presumably stop() runs the
        # callbacks in at_stop_list (confirm in master), and a thread cannot join itself.
        if thread != threading.current_thread():
            thread.join()
            return True
        return False

    at_stop_list.append(join_thread)