A Brief Analysis of Nova Instance Evacuate



This article is based on the OpenStack Newton release.

Overview

Only instances in the active, stopped, or error state can be evacuated, so an instance in any other state must have its state reset before evacuate can be run against it.
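Resetting the state is an admin action on the server (os-resetState). The snippet below is a minimal sketch of that call; the endpoint URL, token and instance UUID are placeholders, not values from this article.

# Minimal sketch of resetting a stuck instance to the error state before
# evacuating it. Endpoint, token and UUID are placeholder assumptions.
import json
import requests

COMPUTE_URL = "http://controller:8774/v2.1"   # assumed Nova API endpoint
TOKEN = "<keystone-token>"                    # assumed auth token
SERVER_ID = "<instance-uuid>"

resp = requests.post(
    "%s/servers/%s/action" % (COMPUTE_URL, SERVER_ID),
    headers={"X-Auth-Token": TOKEN, "Content-Type": "application/json"},
    data=json.dumps({"os-resetState": {"state": "error"}}))
resp.raise_for_status()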

High-level flow (a request sketch follows the list):

    1. Verify that the compute host the instance lives on is down and that the instance is in the active, stopped, or error state; only then is the evacuate allowed, and a migration record is created for the instance.

    2. nova-api sends a rebuild_instance RPC to nova-conductor.

    3. On receiving the rebuild_instance RPC, nova-conductor calls nova-scheduler to select a compute host B with enough resources and looks up the migration record created for this instance, which carries the evacuation-related details.

    4. nova-conductor then tells the selected compute host B to perform rebuild_instance.

    5. When nova-compute receives the rebuild_instance RPC message, it:

      rebinds the instance's network ports to the new host,

      moves the instance's volumes from the old host to host B (detach, then re-attach),

      and finally boots the instance.
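For reference, here is the request sketch mentioned above for kicking off step 1 from a client. The endpoint, token, instance UUID and target host are placeholder assumptions; the body follows the 2.29 microversion schema validated in the API code below.

# Minimal sketch of the evacuate server action. With microversion >= 2.29 an
# optional "force" flag is also accepted; omitting "host" lets nova-scheduler
# pick the destination. All concrete values here are placeholders.
import json
import requests

COMPUTE_URL = "http://controller:8774/v2.1"   # assumed Nova API endpoint
TOKEN = "<keystone-token>"                    # assumed auth token
SERVER_ID = "<instance-uuid>"

body = {"evacuate": {"host": "compute-b",     # assumed destination host
                     "adminPass": "new-pass"}}

resp = requests.post(
    "%s/servers/%s/action" % (COMPUTE_URL, SERVER_ID),
    headers={"X-Auth-Token": TOKEN,
             "Content-Type": "application/json",
             "X-OpenStack-Nova-API-Version": "2.29"},  # selects the 2.29 schema
    data=json.dumps(body))
print(resp.status_code)   # 200 on success (kept for backwards compatibility)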

Code Entry Point

The code lives in nova/api/openstack/compute/evacuate.py.

# TODO(eliqiao): Should be responding here with 202 Accept
# because evacuate is an async call, but keep to 200 for
# backwards compatibility reasons.
@extensions.expected_errors((400, 404, 409))
@wsgi.action('evacuate')
@validation.schema(evacuate.evacuate, "2.1", "2.12")
@validation.schema(evacuate.evacuate_v214, "2.14", "2.28")
@validation.schema(evacuate.evacuate_v2_29, "2.29")
def _evacuate(self, req, id, body):
    """Permit admins to evacuate a server from a failed host
    to a new one.
    """
    context = req.environ["nova.context"]
    instance = common.get_instance(self.compute_api, context, id)
    context.can(evac_policies.BASE_POLICY_NAME,
                target={'user_id': instance.user_id,
                        'project_id': instance.project_id})

    evacuate_body = body["evacuate"]
    host = evacuate_body.get("host")
    force = None

    on_shared_storage = self._get_on_shared_storage(req, evacuate_body)

    if api_version_request.is_supported(req, min_version='2.29'):
        force = body["evacuate"].get("force", False)
        force = strutils.bool_from_string(force, strict=True)
        if force is True and not host:
            message = _("Can't force to a non-provided destination")
            raise exc.HTTPBadRequest(explanation=message)

    if api_version_request.is_supported(req, min_version='2.14'):
        password = self._get_password_v214(req, evacuate_body)
    else:
        password = self._get_password(req, evacuate_body,
                                      on_shared_storage)

    if host is not None:
        try:
            self.host_api.service_get_by_compute_host(context, host)
        except exception.ComputeHostNotFound:
            msg = _("Compute host %s not found.") % host
            raise exc.HTTPNotFound(explanation=msg)

    if instance.host == host:
        msg = _("The target host can't be the same one.")
        raise exc.HTTPBadRequest(explanation=msg)

    try:
        self.compute_api.evacuate(context, instance, host,
                                  on_shared_storage, password, force)
    except exception.InstanceUnknownCell as e:
        raise exc.HTTPNotFound(explanation=e.format_message())
    except exception.InstanceInvalidState as state_error:
        common.raise_http_conflict_for_instance_invalid_state(state_error,
                'evacuate', id)
    except exception.ComputeServiceInUse as e:
        raise exc.HTTPBadRequest(explanation=e.format_message())

    if (not api_version_request.is_supported(req, min_version='2.14') and
            CONF.enable_instance_password):
        return {'adminPass': password}
    else:
        return None

Evacuate Validation

Only instances in the active, stopped, or error state can be evacuated.

The code lives in nova/compute/api.py.

Evacuate is only permitted when the nova-compute service on the instance's current host is down:

@check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED,
                                vm_states.ERROR])
def evacuate(self, context, instance, host, on_shared_storage,
             admin_password=None, force=None):
    """Running evacuate to target host.

    Checking vm compute host state, if the host not in expected_state,
    raising an exception.

    :param instance: The instance to evacuate
    :param host: Target host. if not set, the scheduler will pick up one
    :param on_shared_storage: True if instance files on shared storage
    :param admin_password: password to set on rebuilt instance
    :param force: Force the evacuation to the specific host target
    """
    LOG.debug('vm evacuation scheduled', instance=instance)
    inst_host = instance.host
    service = objects.Service.get_by_compute_host(context, inst_host)
    # Evacuate is only allowed when the nova-compute service on the
    # source host is down.
    if self.servicegroup_api.service_is_up(service):
        LOG.error(_LE('Instance compute service state on %s '
                      'expected to be down, but it was up.'), inst_host)
        raise exception.ComputeServiceInUse(host=inst_host)

    instance.task_state = task_states.REBUILDING
    instance.save(expected_task_state=[None])
    self._record_action_start(context, instance, instance_actions.EVACUATE)

    # NOTE(danms): Create this as a tombstone for the source compute
    # to find and cleanup. No need to pass it anywhere else.
    migration = objects.Migration(context,
                                  source_compute=instance.host,
                                  source_node=instance.node,
                                  instance_uuid=instance.uuid,
                                  status='accepted',
                                  migration_type='evacuation')
    if host:
        migration.dest_compute = host
    migration.create()

    compute_utils.notify_about_instance_usage(
        self.notifier, context, instance, "evacuate")

    try:
        request_spec = objects.RequestSpec.get_by_instance_uuid(
            context, instance.uuid)
    except exception.RequestSpecNotFound:
        # Some old instances can still have no RequestSpec object attached
        # to them, we need to support the old way
        request_spec = None

    # NOTE(sbauza): Force is a boolean by the new related API version
    if force is False and host:
        nodes = objects.ComputeNodeList.get_all_by_host(context, host)
        if not nodes:
            raise exception.ComputeHostNotFound(host=host)
        # NOTE(sbauza): Unset the host to make sure we call the scheduler
        host = None
        # FIXME(sbauza): Since only Ironic driver uses more than one
        # compute per service but doesn't support evacuations,
        # let's provide the first one.
        target = nodes[0]
        if request_spec:
            # TODO(sbauza): Hydrate a fake spec for old instances not yet
            # having a request spec attached to them (particularly true for
            # cells v1). For the moment, let's keep the same behaviour for
            # all the instances but provide the destination only if a spec
            # is found.
            destination = objects.Destination(
                host=target.host,
                node=target.hypervisor_hostname
            )
            request_spec.requested_destination = destination

    return self.compute_task_api.rebuild_instance(context,
                   instance=instance,
                   new_pass=admin_password,
                   injected_files=None,
                   image_ref=None,
                   orig_image_ref=None,
                   orig_sys_metadata=None,
                   bdms=None,
                   recreate=True,
                   on_shared_storage=on_shared_storage,
                   host=host,
                   request_spec=request_spec,
                   )
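The service_is_up() check above ultimately compares the service's last heartbeat against a timeout (CONF.service_down_time, 60 seconds by default, for the DB servicegroup driver). The snippet below is only a simplified illustration of that idea, not the actual servicegroup driver code.

# Simplified illustration of the liveness check: a service counts as "up" if
# its last heartbeat is within service_down_time seconds of now.
import datetime

SERVICE_DOWN_TIME = 60  # stand-in for CONF.service_down_time

def service_is_up(last_heartbeat, now=None):
    now = now or datetime.datetime.utcnow()
    return (now - last_heartbeat).total_seconds() <= SERVICE_DOWN_TIME

# A nova-compute that last reported 5 minutes ago is treated as down, so its
# instances become eligible for evacuation.
stale = datetime.datetime.utcnow() - datetime.timedelta(minutes=5)
print(service_is_up(stale))  # False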

rebuild_instance via nova-conductor

The call goes from conductor.api.rebuild_instance to conductor.rpcapi.rebuild_instance and finally reaches conductor.manager.rebuild_instance; the intermediate layers mostly just pass the parameters through, so they are not covered here.

The code lives in nova/conductor/manager.py.

def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
                     injected_files, new_pass, orig_sys_metadata,
                     bdms, recreate, on_shared_storage,
                     preserve_ephemeral=False, host=None,
                     request_spec=None):

    with compute_utils.EventReporter(context, 'rebuild_server',
                                     instance.uuid):
        node = limits = None
        if not host:
            if not request_spec:
                # NOTE(sbauza): We were unable to find an original
                # RequestSpec object - probably because the instance is old
                # We need to mock that the old way
                filter_properties = {'ignore_hosts': [instance.host]}
                request_spec = scheduler_utils.build_request_spec(
                        context, image_ref, [instance])
            else:
                # NOTE(sbauza): Augment the RequestSpec object by excluding
                # the source host for avoiding the scheduler to pick it
                request_spec.ignore_hosts = request_spec.ignore_hosts or []
                request_spec.ignore_hosts.append(instance.host)
                # NOTE(sbauza): Force_hosts/nodes needs to be reset
                # if we want to make sure that the next destination
                # is not forced to be the original host
                request_spec.reset_forced_destinations()
                # TODO(sbauza): Provide directly the RequestSpec object
                # when _schedule_instances() and _set_vm_state_and_notify()
                # accept it
                filter_properties = request_spec.\
                    to_legacy_filter_properties_dict()
                request_spec = request_spec.to_legacy_request_spec_dict()
            try:
                # Call nova-scheduler to pick a compute host with enough
                # free resources.
                hosts = self._schedule_instances(
                        context, request_spec, filter_properties)
                host_dict = hosts.pop(0)
                host, node, limits = (host_dict['host'],
                                      host_dict['nodename'],
                                      host_dict['limits'])
            except exception.NoValidHost as ex:
                with excutils.save_and_reraise_exception():
                    self._set_vm_state_and_notify(context, instance.uuid,
                            'rebuild_server',
                            {'vm_state': instance.vm_state,
                             'task_state': None}, ex, request_spec)
                    LOG.warning(_LW("No valid host found for rebuild"),
                                instance=instance)
            except exception.UnsupportedPolicyException as ex:
                with excutils.save_and_reraise_exception():
                    self._set_vm_state_and_notify(context, instance.uuid,
                            'rebuild_server',
                            {'vm_state': instance.vm_state,
                             'task_state': None}, ex, request_spec)
                    LOG.warning(_LW("Server with unsupported policy "
                                    "cannot be rebuilt"),
                                instance=instance)

        try:
            # Look up the migration record created by the API layer for
            # this evacuation.
            migration = objects.Migration.get_by_instance_and_status(
                context, instance.uuid, 'accepted')
        except exception.MigrationNotFoundByStatus:
            LOG.debug("No migration record for the rebuild/evacuate "
                      "request.", instance=instance)
            migration = None

        compute_utils.notify_about_instance_usage(
            self.notifier, context, instance, "rebuild.scheduled")

        # Tell the selected compute host to rebuild (evacuate) the instance.
        self.compute_rpcapi.rebuild_instance(context,
                instance=instance,
                new_pass=new_pass,
                injected_files=injected_files,
                image_ref=image_ref,
                orig_image_ref=orig_image_ref,
                orig_sys_metadata=orig_sys_metadata,
                bdms=bdms,
                recreate=recreate,
                on_shared_storage=on_shared_storage,
                preserve_ephemeral=preserve_ephemeral,
                migration=migration,
                host=host, node=node, limits=limits)
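The key scheduling detail above is that the failed source host is appended to ignore_hosts, so the scheduler can never place the instance back on it. The toy filter below only illustrates that exclusion; the host names and free-RAM numbers are made up, and this is not the real scheduler filter chain.

# Toy illustration of the ignore_hosts exclusion: the failed source host is
# never returned as a candidate. Values are invented for the example.
def pick_host(candidates, ignore_hosts):
    usable = [c for c in candidates if c["host"] not in ignore_hosts]
    if not usable:
        raise RuntimeError("No valid host found for rebuild")
    # Crude stand-in for the real weighing step: prefer the most free RAM.
    return max(usable, key=lambda c: c["free_ram_mb"])

candidates = [
    {"host": "compute-a", "free_ram_mb": 8192},   # failed source host
    {"host": "compute-b", "free_ram_mb": 4096},
    {"host": "compute-c", "free_ram_mb": 2048},
]
print(pick_host(candidates, ignore_hosts=["compute-a"]))  # -> compute-b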

rebuild_instance on the Selected Compute Host

The call goes through compute.rpcapi.rebuild_instance and finally reaches compute.manager.rebuild_instance; the intermediate layer mostly just passes the parameters through, so it is not covered here.

The code lives in nova/compute/manager.py.

The main work here is to rebind the instance's network ports to the new host, detach the instance's volumes from the old host and re-attach them on this host, and finally boot the instance (a condensed sketch of this ordering follows the code below).

@messaging.expected_exceptions(exception.PreserveEphemeralNotSupported)
@wrap_exception()
@reverts_task_state
@wrap_instance_event(prefix='compute')
@wrap_instance_fault
def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
                     injected_files, new_pass, orig_sys_metadata,
                     bdms, recreate, on_shared_storage=None,
                     preserve_ephemeral=False, migration=None,
                     scheduled_node=None, limits=None):
    """Destroy and re-make this instance.

    A 'rebuild' effectively purges all existing data from the system and
    remakes the VM with given 'metadata' and 'personalities'.

    :param context: `nova.RequestContext` object
    :param instance: Instance object
    :param orig_image_ref: Original image_ref before rebuild
    :param image_ref: New image_ref for rebuild
    :param injected_files: Files to inject
    :param new_pass: password to set on rebuilt instance
    :param orig_sys_metadata: instance system metadata from pre-rebuild
    :param bdms: block-device-mappings to use for rebuild
    :param recreate: True if the instance is being recreated (e.g. the
        hypervisor it was on failed) - cleanup of old state will be
        skipped.
    :param on_shared_storage: True if instance files on shared storage.
                              If not provided then information from the
                              driver will be used to decide if the instance
                              files are available or not on the target host
    :param preserve_ephemeral: True if the default ephemeral storage
                               partition must be preserved on rebuild
    :param migration: a Migration object if one was created for this
                      rebuild operation (if it's a part of evacuate)
    :param scheduled_node: A node of the host chosen by the scheduler. If a
                           host was specified by the user, this will be
                           None
    :param limits: Overcommit limits set by the scheduler. If a host was
                   specified by the user, this will be None
    """
    context = context.elevated()
    LOG.info(_LI("Rebuilding instance"), context=context,
             instance=instance)

    if scheduled_node is not None:
        rt = self._get_resource_tracker(scheduled_node)
        rebuild_claim = rt.rebuild_claim
    else:
        rebuild_claim = claims.NopClaim

    image_meta = {}
    if image_ref:
        image_meta = self.image_api.get(context, image_ref)

    # NOTE(mriedem): On a recreate (evacuate), we need to update
    # the instance's host and node properties to reflect it's
    # destination node for the recreate.
    if not scheduled_node:
        try:
            compute_node = self._get_compute_info(context, self.host)
            scheduled_node = compute_node.hypervisor_hostname
        except exception.ComputeHostNotFound:
            LOG.exception(_LE('Failed to get compute_info for %s'),
                          self.host)

    with self._error_out_instance_on_exception(context, instance):
        try:
            claim_ctxt = rebuild_claim(
                context, instance, limits=limits, image_meta=image_meta,
                migration=migration)
            self._do_rebuild_instance_with_claim(
                claim_ctxt, context, instance, orig_image_ref,
                image_ref, injected_files, new_pass, orig_sys_metadata,
                bdms, recreate, on_shared_storage, preserve_ephemeral)
        except exception.ComputeResourcesUnavailable as e:
            LOG.debug("Could not rebuild instance on this host, not "
                      "enough resources available.", instance=instance)
            # NOTE(ndipanov): We just abort the build for now and leave a
            # migration record for potential cleanup later
            self._set_migration_status(migration, 'failed')
            self._notify_about_instance_usage(context, instance,
                    'rebuild.error', fault=e)
            raise exception.BuildAbortException(
                instance_uuid=instance.uuid, reason=e.format_message())
        except (exception.InstanceNotFound,
                exception.UnexpectedDeletingTaskStateError) as e:
            LOG.debug('Instance was deleted while rebuilding',
                      instance=instance)
            self._set_migration_status(migration, 'failed')
            self._notify_about_instance_usage(context, instance,
                    'rebuild.error', fault=e)
        except Exception as e:
            self._set_migration_status(migration, 'failed')
            self._notify_about_instance_usage(context, instance,
                    'rebuild.error', fault=e)
            raise
        else:
            instance.apply_migration_context()
            # NOTE (ndipanov): This save will now update the host and node
            # attributes making sure that next RT pass is consistent since
            # it will be based on the instance and not the migration DB
            # entry.
            instance.host = self.host
            instance.node = scheduled_node
            instance.save()
            instance.drop_migration_context()

            # NOTE (ndipanov): Mark the migration as done only after we
            # mark the instance as belonging to this host.
            self._set_migration_status(migration, 'done')
def _do_rebuild_instance_with_claim(self, claim_context, *args, **kwargs):
    """Helper to avoid deep nesting in the top-level method."""

    with claim_context:
        self._do_rebuild_instance(*args, **kwargs)
def _do_rebuild_instance(self, context, instance, orig_image_ref,
                         image_ref, injected_files, new_pass,
                         orig_sys_metadata, bdms, recreate,
                         on_shared_storage, preserve_ephemeral):
    orig_vm_state = instance.vm_state

    if recreate:
        if not self.driver.capabilities["supports_recreate"]:
            raise exception.InstanceRecreateNotSupported

        self._check_instance_exists(context, instance)

        # Depending on the storage backend, the driver decides on its own
        # whether the instance disks live on shared storage.
        if on_shared_storage is None:
            LOG.debug('on_shared_storage is not provided, using driver'
                      'information to decide if the instance needs to'
                      'be recreated')
            on_shared_storage = self.driver.instance_on_disk(instance)

        elif (on_shared_storage !=
                self.driver.instance_on_disk(instance)):
            # To cover case when admin expects that instance files are
            # on shared storage, but not accessible and vice versa
            raise exception.InvalidSharedStorage(
                    _("Invalid state of instance files on shared"
                      " storage"))

        if on_shared_storage:
            LOG.info(_LI('disk on shared storage, recreating using'
                         ' existing disk'))
        else:
            image_ref = orig_image_ref = instance.image_ref
            LOG.info(_LI("disk not on shared storage, rebuilding from:"
                         " '%s'"), str(image_ref))

    if image_ref:
        image_meta = objects.ImageMeta.from_image_ref(
            context, self.image_api, image_ref)
    else:
        image_meta = instance.image_meta

    # This instance.exists message should contain the original
    # image_ref, not the new one.  Since the DB has been updated
    # to point to the new one... we have to override it.
    # TODO(jaypipes): Move generate_image_url() into the nova.image.api
    orig_image_ref_url = glance.generate_image_url(orig_image_ref)
    extra_usage_info = {'image_ref_url': orig_image_ref_url}
    compute_utils.notify_usage_exists(
            self.notifier, context, instance,
            current_period=True, system_metadata=orig_sys_metadata,
            extra_usage_info=extra_usage_info)

    # This message should contain the new image_ref
    extra_usage_info = {'image_name': self._get_image_name(image_meta)}
    self._notify_about_instance_usage(context, instance,
            "rebuild.start", extra_usage_info=extra_usage_info)

    instance.power_state = self._get_power_state(context, instance)
    instance.task_state = task_states.REBUILDING
    instance.save(expected_task_state=[task_states.REBUILDING])

    if recreate:
        self.network_api.setup_networks_on_host(
                context, instance, self.host)
        # For nova-network this is needed to move floating IPs
        # For neutron this updates the host in the port binding
        # TODO(cfriesen): this network_api call and the one above
        # are so similar, we should really try to unify them.
        self.network_api.setup_instance_network_on_host(
                context, instance, self.host)

    network_info = compute_utils.get_nw_info_for_instance(instance)
    if bdms is None:
        bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                context, instance.uuid)

    block_device_info = \
        self._get_instance_block_device_info(
                context, instance, bdms=bdms)

    def detach_block_devices(context, bdms):
        for bdm in bdms:
            if bdm.is_volume:
                self._detach_volume(context, bdm.volume_id, instance,
                                    destroy_bdm=False)

    files = self._decode_files(injected_files)

    kwargs = dict(
        context=context,
        instance=instance,
        image_meta=image_meta,
        injected_files=files,
        admin_password=new_pass,
        bdms=bdms,
        detach_block_devices=detach_block_devices,
        attach_block_devices=self._prep_block_device,
        block_device_info=block_device_info,
        network_info=network_info,
        preserve_ephemeral=preserve_ephemeral,
        recreate=recreate)
    try:
        with instance.mutated_migration_context():
            self.driver.rebuild(**kwargs)
    except NotImplementedError:
        # NOTE(rpodolyaka): driver doesn't provide specialized version
        # of rebuild, fall back to the default implementation
        self._rebuild_default_impl(**kwargs)
    self._update_instance_after_spawn(context, instance)
    instance.save(expected_task_state=[task_states.REBUILD_SPAWNING])

    if orig_vm_state == vm_states.STOPPED:
        LOG.info(_LI("bringing vm to original state: '%s'"),
                 orig_vm_state, instance=instance)
        instance.vm_state = vm_states.ACTIVE
        instance.task_state = task_states.POWERING_OFF
        instance.progress = 0
        instance.save()
        self.stop_instance(context, instance, False)

    self._update_scheduler_instance_info(context, instance)
    self._notify_about_instance_usage(
            context, instance, "rebuild.end",
            network_info=network_info,
            extra_usage_info=extra_usage_info)
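Putting the pieces of _do_rebuild_instance together, the evacuate (recreate) path on the destination host boils down to: rebind the ports, detach the volumes from their old connections, re-attach them here, then spawn the guest. The sketch below only condenses that ordering; the data structures and step strings are this article's shorthand, not Nova APIs.

# Condensed, illustrative ordering of the recreate path on the destination
# host. The dict layout and helper are shorthand, not real Nova code.
def recreate_on_destination(instance, bdms, dest_host):
    steps = []
    # 1. Rebind the instance's network ports to the destination host.
    steps.append("bind ports of %s to %s" % (instance["uuid"], dest_host))
    # 2. Detach each volume from its (dead) source-host connection.
    for bdm in bdms:
        if bdm["is_volume"]:
            steps.append("detach volume %s" % bdm["volume_id"])
    # 3. Re-attach the volumes on the destination host.
    for bdm in bdms:
        if bdm["is_volume"]:
            steps.append("attach volume %s on %s" % (bdm["volume_id"], dest_host))
    # 4. Spawn the guest (from shared storage, or from the original image).
    steps.append("spawn %s" % instance["uuid"])
    return steps

instance = {"uuid": "vm-1"}
bdms = [{"is_volume": True, "volume_id": "vol-1"}]
for step in recreate_on_destination(instance, bdms, "compute-b"):
    print(step)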