-
Notifications
You must be signed in to change notification settings - Fork 0
<fix>[vm]: add MetadataImpact #3612
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,72 @@ | ||
| package org.zstack.compute.vm; | ||
|
|
||
| import org.zstack.core.cloudbus.CloudBusCallBack; | ||
| import org.zstack.core.gc.GC; | ||
| import org.zstack.core.gc.GCCompletion; | ||
| import org.zstack.core.gc.TimeBasedGarbageCollector; | ||
| import org.zstack.header.host.HostVO; | ||
| import org.zstack.header.message.MessageReply; | ||
| import org.zstack.header.storage.primary.CleanupVmInstanceMetadataOnPrimaryStorageMsg; | ||
| import org.zstack.header.storage.primary.PrimaryStorageConstant; | ||
| import org.zstack.header.storage.primary.PrimaryStorageVO; | ||
| import org.zstack.utils.Utils; | ||
| import org.zstack.utils.logging.CLogger; | ||
|
|
||
| public class CleanupVmInstanceMetadataOnPrimaryStorageGC extends TimeBasedGarbageCollector { | ||
| private static final CLogger logger = Utils.getLogger(CleanupVmInstanceMetadataOnPrimaryStorageGC.class); | ||
|
|
||
| @GC | ||
| public String primaryStorageUuid; | ||
| @GC | ||
| public String vmUuid; | ||
| @GC | ||
| public String rootVolumeUuid; | ||
| @GC | ||
| public String metadataPath; | ||
| @GC | ||
| public String hostUuid; | ||
|
|
||
| public static String getGCName(String vmUuid) { | ||
| return String.format("gc-cleanup-vm-metadata-%s", vmUuid); | ||
| } | ||
|
|
||
| @Override | ||
| protected void triggerNow(GCCompletion completion) { | ||
| if (!dbf.isExist(primaryStorageUuid, PrimaryStorageVO.class)) { | ||
| logger.debug(String.format("[MetadataCleanupGC] primary storage[uuid:%s] no longer exists, " + | ||
| "cancel gc for vm[uuid:%s]", primaryStorageUuid, vmUuid)); | ||
| completion.cancel(); | ||
| return; | ||
| } | ||
|
|
||
| if (hostUuid != null && !dbf.isExist(hostUuid, HostVO.class)) { | ||
| logger.debug(String.format("[MetadataCleanupGC] host[uuid:%s] no longer exists, " + | ||
| "cancel gc for vm[uuid:%s]", hostUuid, vmUuid)); | ||
| completion.cancel(); | ||
| return; | ||
| } | ||
|
|
||
| CleanupVmInstanceMetadataOnPrimaryStorageMsg msg = new CleanupVmInstanceMetadataOnPrimaryStorageMsg(); | ||
| msg.setPrimaryStorageUuid(primaryStorageUuid); | ||
| msg.setVmUuid(vmUuid); | ||
| msg.setRootVolumeUuid(rootVolumeUuid); | ||
| msg.setMetadataPath(metadataPath); | ||
| msg.setHostUuid(hostUuid); | ||
|
|
||
| bus.makeTargetServiceIdByResourceUuid(msg, PrimaryStorageConstant.SERVICE_ID, primaryStorageUuid); | ||
| bus.send(msg, new CloudBusCallBack(completion) { | ||
| @Override | ||
| public void run(MessageReply reply) { | ||
| if (reply.isSuccess()) { | ||
| logger.info(String.format("[MetadataCleanupGC] successfully cleaned up metadata " + | ||
| "for vm[uuid:%s] on ps[uuid:%s]", vmUuid, primaryStorageUuid)); | ||
| completion.success(); | ||
| } else { | ||
| logger.warn(String.format("[MetadataCleanupGC] failed to clean up metadata " + | ||
| "for vm[uuid:%s] on ps[uuid:%s]: %s", vmUuid, primaryStorageUuid, reply.getError())); | ||
| completion.fail(reply.getError()); | ||
| } | ||
| } | ||
| }); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,128 @@ | ||
| package org.zstack.compute.vm; | ||
|
|
||
| import org.springframework.beans.factory.annotation.Autowire; | ||
| import org.springframework.beans.factory.annotation.Autowired; | ||
| import org.springframework.beans.factory.annotation.Configurable; | ||
| import org.zstack.core.cloudbus.CloudBus; | ||
| import org.zstack.core.cloudbus.CloudBusCallBack; | ||
| import org.zstack.core.componentloader.PluginRegistry; | ||
| import org.zstack.core.db.Q; | ||
| import org.zstack.header.core.workflow.FlowTrigger; | ||
| import org.zstack.header.core.workflow.NoRollbackFlow; | ||
| import org.zstack.header.message.MessageReply; | ||
| import org.zstack.header.storage.primary.CleanupVmInstanceMetadataOnPrimaryStorageMsg; | ||
| import org.zstack.header.storage.primary.PrimaryStorageConstant; | ||
| import org.zstack.header.storage.primary.PrimaryStorageVO; | ||
| import org.zstack.header.storage.primary.PrimaryStorageVO_; | ||
| import org.zstack.header.vm.VmInstanceConstant; | ||
| import org.zstack.header.vm.VmInstanceSpec; | ||
| import org.zstack.header.vm.metadata.VmMetadataPathBuildExtensionPoint; | ||
| import org.zstack.header.volume.VolumeInventory; | ||
| import org.zstack.utils.Utils; | ||
| import org.zstack.utils.logging.CLogger; | ||
|
|
||
| import java.util.Map; | ||
| import java.util.concurrent.TimeUnit; | ||
|
|
||
| @Configurable(preConstruction = true, autowire = Autowire.BY_TYPE) | ||
| public class VmExpungeMetadataFlow extends NoRollbackFlow { | ||
| private static final CLogger logger = Utils.getLogger(VmExpungeMetadataFlow.class); | ||
|
|
||
| @Autowired | ||
| private CloudBus bus; | ||
| @Autowired | ||
| private PluginRegistry pluginRgty; | ||
|
|
||
| @Override | ||
| public void run(FlowTrigger trigger, Map data) { | ||
| final VmInstanceSpec spec = (VmInstanceSpec) data.get(VmInstanceConstant.Params.VmInstanceSpec.toString()); | ||
| if (spec == null || spec.getVmInventory() == null) { | ||
| logger.warn("[MetadataExpunge] missing VmInstanceSpec or VmInventory, skip metadata cleanup"); | ||
| trigger.next(); | ||
| return; | ||
| } | ||
|
|
||
| final String vmUuid = spec.getVmInventory().getUuid(); | ||
|
|
||
| VolumeInventory rootVolume = spec.getVmInventory().getRootVolume(); | ||
| String psUuid = rootVolume != null ? rootVolume.getPrimaryStorageUuid() : null; | ||
| if (psUuid == null) { | ||
| logger.debug(String.format("[MetadataExpunge] vm[uuid:%s] root volume has no primaryStorageUuid, " + | ||
| "skipping metadata cleanup", vmUuid)); | ||
| trigger.next(); | ||
| return; | ||
| } | ||
|
|
||
|
|
||
| String psType = Q.New(PrimaryStorageVO.class).select(PrimaryStorageVO_.type).eq(PrimaryStorageVO_.uuid, psUuid).findValue(); | ||
| if (psType == null) { | ||
| logger.warn(String.format("[MetadataExpunge] primary storage[uuid:%s] not found for vm[uuid:%s], " + | ||
| "skip metadata cleanup", psUuid, vmUuid)); | ||
| trigger.next(); | ||
| return; | ||
| } | ||
|
|
||
| VmMetadataPathBuildExtensionPoint ext = pluginRgty.getExtensionFromMap(psType, VmMetadataPathBuildExtensionPoint.class); | ||
| if (ext == null) { | ||
| logger.warn(String.format("[MetadataExpunge] no VmMetadataPathBuildExtensionPoint found for ps[uuid:%s, type:%s], " + | ||
| "skip metadata cleanup", psUuid, psType)); | ||
| trigger.next(); | ||
| return; | ||
| } | ||
| final String metadataPath; | ||
| try { | ||
| metadataPath = ext.buildVmMetadataPath(psUuid, vmUuid); | ||
| } catch (Exception e) { | ||
| logger.warn(String.format("[MetadataExpunge] failed to build metadata path for vm[uuid:%s] on ps[uuid:%s], " + | ||
| "skip metadata cleanup: %s", vmUuid, psUuid, e.getMessage())); | ||
| trigger.next(); | ||
| return; | ||
| } | ||
|
Comment on lines
+73
to
+80
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: #!/bin/bash
# 检查 buildVmMetadataPath 实现是否依赖数据库查询
ast-grep --pattern $'buildVmMetadataPath($_, $_) {
$$$
}'Repository: MatheMatrix/zstack Length of output: 44 🏁 Script executed: # 查找 VmExpungeMetadataFlow.java 文件并检查 ext 对象的类型
find . -name "VmExpungeMetadataFlow.java" -type fRepository: MatheMatrix/zstack Length of output: 134 🏁 Script executed: # 搜索 buildVmMetadataPath 方法定义
rg "buildVmMetadataPath" -A 5 -B 2Repository: MatheMatrix/zstack Length of output: 4443 🏁 Script executed: # 查找流程顺序配置
fd "VmInstanceManager.xml" -type fRepository: MatheMatrix/zstack Length of output: 232 🏁 Script executed: # 搜索 VmInstanceManager.xml 获取流程顺序
find . -name "*.xml" | xargs grep -l "VmExpungeMetadataFlow\|VmExpungeRootVolumeFlow" 2>/dev/null | head -5Repository: MatheMatrix/zstack Length of output: 106 🏁 Script executed: # 查看 VmExpungeMetadataFlow.java 的完整内容
cat -n ./compute/src/main/java/org/zstack/compute/vm/VmExpungeMetadataFlow.javaRepository: MatheMatrix/zstack Length of output: 6889 🏁 Script executed: # 搜索 VmExpungeRootVolumeFlow
rg "class VmExpungeRootVolumeFlow" -A 30Repository: MatheMatrix/zstack Length of output: 3798 🏁 Script executed: cat ./conf/springConfigXml/VmInstanceManager.xml | grep -A 5 -B 5 "VmExpunge"Repository: MatheMatrix/zstack Length of output: 763 补充 GC 机制处理元数据路径构建失败的情况。
建议在第 75-80 行的异常处理中也提交 GC 任务,确保路径构建失败不会导致元数据被遗弃: } catch (Exception e) {
logger.warn(String.format("[MetadataExpunge] failed to build metadata path for vm[uuid:%s] on ps[uuid:%s], " +
"skip metadata cleanup: %s", vmUuid, psUuid, e.getMessage()));
// 提交 GC 任务以确保元数据最终被清理
submitGC(psUuid, vmUuid, rootVolumeUuid, metadataPath, hostUuid);
trigger.next();
return;
}或者考虑在异常捕获时只捕获特定的预期异常,而非泛型 Exception。 🤖 Prompt for AI Agents |
||
|
|
||
| String rootVolumeUuid = rootVolume.getUuid(); | ||
| CleanupVmInstanceMetadataOnPrimaryStorageMsg cmsg = new CleanupVmInstanceMetadataOnPrimaryStorageMsg(); | ||
| cmsg.setPrimaryStorageUuid(psUuid); | ||
| cmsg.setVmUuid(vmUuid); | ||
| cmsg.setMetadataPath(metadataPath); | ||
| cmsg.setRootVolumeUuid(rootVolumeUuid); | ||
|
|
||
| String hostUuid = spec.getVmInventory().getHostUuid(); | ||
| if (hostUuid == null) { | ||
| hostUuid = spec.getVmInventory().getLastHostUuid(); | ||
| } | ||
| cmsg.setHostUuid(hostUuid); | ||
|
|
||
| final String finalPsUuid = psUuid; | ||
| final String finalHostUuid = hostUuid; | ||
|
|
||
| bus.makeTargetServiceIdByResourceUuid(cmsg, PrimaryStorageConstant.SERVICE_ID, psUuid); | ||
| bus.send(cmsg, new CloudBusCallBack(trigger) { | ||
| @Override | ||
| public void run(MessageReply reply) { | ||
| if (reply.isSuccess()) { | ||
| logger.info(String.format("[MetadataExpunge] successfully deleted metadata for vm[uuid:%s] on ps[uuid:%s]", | ||
| vmUuid, finalPsUuid)); | ||
| } else { | ||
| logger.warn(String.format("[MetadataExpunge] failed to delete metadata for vm[uuid:%s] on ps[uuid:%s]: %s, " + | ||
| "submitting GC job for retry", vmUuid, finalPsUuid, reply.getError())); | ||
| submitGC(finalPsUuid, vmUuid, rootVolumeUuid, metadataPath, finalHostUuid); | ||
| } | ||
| trigger.next(); | ||
| } | ||
| }); | ||
| } | ||
|
|
||
| private void submitGC(String psUuid, String vmUuid, String rootVolumeUuid, String metadataPath, String hostUuid) { | ||
| CleanupVmInstanceMetadataOnPrimaryStorageGC gc = new CleanupVmInstanceMetadataOnPrimaryStorageGC(); | ||
| gc.NAME = CleanupVmInstanceMetadataOnPrimaryStorageGC.getGCName(vmUuid); | ||
| gc.primaryStorageUuid = psUuid; | ||
| gc.vmUuid = vmUuid; | ||
| gc.rootVolumeUuid = rootVolumeUuid; | ||
| gc.metadataPath = metadataPath; | ||
| gc.hostUuid = hostUuid; | ||
| long gcIntervalSec = TimeUnit.HOURS.toSeconds(VmGlobalConfig.VM_METADATA_CLEANUP_GC_INTERVAL.value(Long.class)); | ||
| gc.deduplicateSubmit(gcIntervalSec, TimeUnit.SECONDS); | ||
|
Comment on lines
+102
to
+124
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 不要把所有清理失败都无条件转成 GC 重试。 这里对任意失败的 `reply` 都会调用 `submitGC(...)`,没有区分瞬态错误和永久性错误;永久性错误不应进入 GC 重试。 🤖 Prompt for AI Agents |
||
|
|
||
| logger.info(String.format("[MetadataExpunge] submitted GC job [%s] for vm[uuid:%s] on ps[uuid:%s]", gc.NAME, vmUuid, psUuid)); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,14 +23,19 @@ | |
| import org.zstack.header.network.l3.*; | ||
| import org.zstack.header.storage.primary.PrimaryStorageClusterRefVO; | ||
| import org.zstack.header.storage.primary.PrimaryStorageClusterRefVO_; | ||
| import org.zstack.header.storage.primary.PrimaryStorageVO; | ||
| import org.zstack.header.storage.primary.PrimaryStorageVO_; | ||
| import org.zstack.header.storage.snapshot.VolumeSnapshotVO; | ||
| import org.zstack.header.storage.snapshot.VolumeSnapshotVO_; | ||
| import org.zstack.header.storage.snapshot.group.VolumeSnapshotGroupVO; | ||
| import org.zstack.header.storage.snapshot.group.VolumeSnapshotGroupVO_; | ||
| import org.zstack.header.vm.*; | ||
| import org.zstack.header.vm.cdrom.*; | ||
| import org.zstack.header.vm.metadata.APIRegisterVmInstanceFromMetadataMsg; | ||
| import org.zstack.header.vm.devices.VmInstanceResourceMetadataGroupVO; | ||
| import org.zstack.header.vm.devices.VmInstanceResourceMetadataGroupVO_; | ||
| import org.zstack.header.vm.metadata.VmInstanceMetadataConstants; | ||
| import org.zstack.header.vm.metadata.VmMetadataPathBuildExtensionPoint; | ||
| import org.zstack.header.volume.*; | ||
| import org.zstack.network.l2.L2NetworkHostUtils; | ||
| import org.zstack.resourceconfig.ResourceConfigFacade; | ||
|
|
@@ -166,6 +171,8 @@ else if (msg instanceof APIAttachVmNicToVmMsg) { | |
| validate((APIConvertTemplatedVmInstanceToVmInstanceMsg) msg); | ||
| } else if (msg instanceof APIDeleteTemplatedVmInstanceMsg) { | ||
| validate((APIDeleteTemplatedVmInstanceMsg) msg); | ||
| } else if (msg instanceof APIRegisterVmInstanceFromMetadataMsg) { | ||
| validate((APIRegisterVmInstanceFromMetadataMsg) msg); | ||
| } | ||
|
|
||
| if (msg instanceof NewVmInstanceMessage2) { | ||
|
|
@@ -1318,4 +1325,29 @@ private void validate(APIFstrimVmMsg msg) { | |
| } | ||
| msg.setHostUuid(t.get(1, String.class)); | ||
| } | ||
|
|
||
| private void validate(APIRegisterVmInstanceFromMetadataMsg msg) { | ||
| String path = msg.getMetadataPath(); | ||
| if (StringUtils.isEmpty(path)) { | ||
| throw new ApiMessageInterceptionException(argerr("metadataPath cannot be empty or null")); | ||
| } | ||
|
Comment on lines
+1330
to
+1333
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 把纯空白的 `metadataPath` 也拦掉。 现在只拦了 `null` 和空字符串,纯空白的路径仍然会通过校验。 ✍️ 建议修改 String path = msg.getMetadataPath();
- if (path == null || path.isEmpty()) {
+ if (path == null || path.trim().isEmpty()) {
throw new ApiMessageInterceptionException(argerr("metadataPath cannot be empty"));
}🤖 Prompt for AI Agents |
||
|
|
||
| // Delegate path validation to the storage-type-specific extension | ||
| String psUuid = msg.getPrimaryStorageUuid(); | ||
| String psType = Q.New(PrimaryStorageVO.class) | ||
| .select(PrimaryStorageVO_.type) | ||
| .eq(PrimaryStorageVO_.uuid, psUuid) | ||
| .findValue(); | ||
| VmMetadataPathBuildExtensionPoint ext = (psType != null) | ||
| ? pluginRgty.getExtensionFromMap(psType, VmMetadataPathBuildExtensionPoint.class) : null; | ||
| if (ext == null) { | ||
| throw new ApiMessageInterceptionException(argerr( | ||
| "primary storage[uuid:%s, type:%s] does not support vm metadata", psUuid, psType)); | ||
| } | ||
|
|
||
| String error = ext.validateMetadataPath(psUuid, path); | ||
| if (error != null) { | ||
| throw new ApiMessageInterceptionException(argerr(error)); | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,26 @@ | ||
| CREATE TABLE IF NOT EXISTS `zstack`.`VmMetadataDirtyVO` ( | ||
| `vmInstanceUuid` VARCHAR(32) NOT NULL, | ||
| `managementNodeUuid` VARCHAR(32) DEFAULT NULL, | ||
| `dirtyVersion` BIGINT NOT NULL DEFAULT 1, | ||
| `lastClaimTime` TIMESTAMP NULL DEFAULT NULL, | ||
| `storageStructureChange` TINYINT(1) NOT NULL DEFAULT 0, | ||
| `retryCount` INT NOT NULL DEFAULT 0, | ||
| `nextRetryTime` TIMESTAMP NULL DEFAULT NULL, | ||
| `lastOpDate` timestamp on update CURRENT_TIMESTAMP, | ||
| `createDate` timestamp NOT NULL DEFAULT '1999-12-31 23:59:59', | ||
| PRIMARY KEY (`vmInstanceUuid`), | ||
| CONSTRAINT `fkVmMetadataDirtyVOVmInstanceEO` FOREIGN KEY (`vmInstanceUuid`) REFERENCES `VmInstanceEO` (`uuid`) ON DELETE CASCADE, | ||
| CONSTRAINT `fkVmMetadataDirtyVOManagementNodeVO` FOREIGN KEY (`managementNodeUuid`) REFERENCES `ManagementNodeVO` (`uuid`) ON DELETE SET NULL | ||
|
Comment on lines
+12
to
+13
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 外键引用建议显式带上 `zstack` schema 前缀。 这个目录里的升级脚本约定是固定使用 `zstack`.`表名` 的写法。 🛠️ 建议修改- CONSTRAINT `fkVmMetadataDirtyVOVmInstanceEO` FOREIGN KEY (`vmInstanceUuid`) REFERENCES `VmInstanceEO` (`uuid`) ON DELETE CASCADE,
- CONSTRAINT `fkVmMetadataDirtyVOManagementNodeVO` FOREIGN KEY (`managementNodeUuid`) REFERENCES `ManagementNodeVO` (`uuid`) ON DELETE SET NULL
+ CONSTRAINT `fkVmMetadataDirtyVOVmInstanceEO` FOREIGN KEY (`vmInstanceUuid`) REFERENCES `zstack`.`VmInstanceEO` (`uuid`) ON DELETE CASCADE,
+ CONSTRAINT `fkVmMetadataDirtyVOManagementNodeVO` FOREIGN KEY (`managementNodeUuid`) REFERENCES `zstack`.`ManagementNodeVO` (`uuid`) ON DELETE SET NULL
...
- CONSTRAINT `fkVmMetadataFingerprintVOVmInstanceEO` FOREIGN KEY (`vmInstanceUuid`) REFERENCES `VmInstanceEO` (`uuid`) ON DELETE CASCADE
+ CONSTRAINT `fkVmMetadataFingerprintVOVmInstanceEO` FOREIGN KEY (`vmInstanceUuid`) REFERENCES `zstack`.`VmInstanceEO` (`uuid`) ON DELETE CASCADEBased on learnings, in Also applies to: 25-25 🤖 Prompt for AI Agents |
||
| ) ENGINE=InnoDB DEFAULT CHARSET=utf8; | ||
|
|
||
| -- One row per VM (primary key vmInstanceUuid); the row is removed with its VM via ON DELETE CASCADE. | ||
| CREATE TABLE IF NOT EXISTS `zstack`.`VmMetadataFingerprintVO` ( | ||
| `vmInstanceUuid` VARCHAR(32) NOT NULL, | ||
| `metadataSnapshot` LONGTEXT, | ||
| `lastFlushTime` TIMESTAMP NULL DEFAULT NULL, | ||
| `lastFlushFailed` TINYINT(1) NOT NULL DEFAULT 0, | ||
| `staleRecoveryCount` INT NOT NULL DEFAULT 0, | ||
| `lastOpDate` timestamp on update CURRENT_TIMESTAMP, | ||
| `createDate` timestamp NOT NULL DEFAULT '1999-12-31 23:59:59', | ||
| PRIMARY KEY (`vmInstanceUuid`), | ||
| -- The referenced table is schema-qualified, like the table being created, so the constraint does not depend on the session's default database. | ||
| CONSTRAINT `fkVmMetadataFingerprintVOVmInstanceEO` FOREIGN KEY (`vmInstanceUuid`) REFERENCES `zstack`.`VmInstanceEO` (`uuid`) ON DELETE CASCADE | ||
| ) ENGINE=InnoDB DEFAULT CHARSET=utf8; | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
GC 未区分瞬态与永久性错误,可能导致无限重试。
当 `completion.fail()` 被调用时,`TimeBasedGarbageCollector` 会重新调度 GC 任务。如果错误是永久性的(如元数据路径无效、权限问题等),GC 会无限重试。建议:对永久性错误调用 `completion.cancel()` 而非 `completion.fail()`。
🛠️ 建议修改
```diff
 } else {
     logger.warn(String.format("[MetadataCleanupGC] failed to clean up metadata " +
             "for vm[uuid:%s] on ps[uuid:%s]: %s", vmUuid, primaryStorageUuid, reply.getError()));
-    completion.fail(reply.getError());
+    // 对于永久性错误(如路径不存在),取消 GC 而非重试
+    if (reply.getError().isError(VmMetadataErrors.METADATA_NOT_FOUND)
+            || reply.getError().isError(PrimaryStorageErrors.PRIMARY_STORAGE_NOT_FOUND)) {
+        logger.info(String.format("[MetadataCleanupGC] cancel gc for vm[uuid:%s] due to permanent error: %s",
+                vmUuid, reply.getError().getCode()));
+        completion.cancel();
+    } else {
+        completion.fail(reply.getError());
+    }
 }
```
🤖 Prompt for AI Agents