Merge tag 'kvm-x86-svm-6.19' of https://github.com/kvm-x86/linux into HEAD

KVM SVM changes for 6.19:

 - Fix a few bugs where KVM failed to mark modified VMCB state as dirty.

 - Fix the worst of KVM's lack of EFER.LMSLE emulation.

 - Add AVIC support for addressing 4k vCPUs in x2AVIC mode.

 - Fix incorrect handling of selective CR0 writes when checking intercepts
   during emulation of L2 instructions.

 - Fix a currently-benign bug where KVM would clobber SPEC_CTRL[63:32] on
   VMRUN and #VMEXIT.

 - Fix a bug where KVM would corrupt the guest code stream when re-injecting a soft
   interrupt if the guest patched the underlying code after the VM-Exit, e.g.
   when Linux patches code with a temporary INT3.

 - Add KVM_X86_SNP_POLICY_BITS to advertise supported SNP policy bits to
   userspace, and extend KVM "support" to all policy bits that don't require
   any actual support from KVM.
Merged by Paolo Bonzini on 2025-11-26 09:46:45 +01:00
16 changed files with 314 additions and 89 deletions


@@ -48,7 +48,14 @@ versus "has_error_code", i.e. KVM's ABI follows AMD behavior.
Nested virtualization features
------------------------------
TBD
On AMD CPUs, when GIF is cleared, #DB exceptions or traps due to a breakpoint
register match are ignored and discarded by the CPU. The CPU relies on the VMM
to fully virtualize this behavior, even when vGIF is enabled for the guest
(i.e. vGIF=0 does not cause the CPU to drop #DBs when the guest is running).
KVM does not virtualize this behavior as the complexity is unjustified given
the rarity of the use case. One way to handle this would be for KVM to
intercept the #DB, temporarily disable the breakpoint, single-step over the
instruction, then re-enable the breakpoint.
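A minimal sketch of the approach described above, purely illustrative (KVM does
not implement this); every helper name below is a hypothetical placeholder:

static int handle_db_while_gif_clear(struct kvm_vcpu *vcpu)
{
	/* GIF is set: deliver the #DB to the guest as usual. */
	if (!guest_gif_is_clear(vcpu))			/* hypothetical */
		return deliver_db_to_guest(vcpu);	/* hypothetical */

	/* Swallow the #DB and temporarily clear the matching DR7 enable bits. */
	save_and_disable_guest_breakpoints(vcpu);	/* hypothetical */

	/* Single-step over the instruction, e.g. via RFLAGS.TF. */
	request_single_step(vcpu);			/* hypothetical */

	/*
	 * On the resulting single-step #DB, restore DR7 and resume the guest;
	 * the original breakpoint #DB is dropped, matching hardware behavior
	 * when GIF=0.
	 */
	return 1;
}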
x2APIC
------


@@ -338,6 +338,7 @@
#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */
#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */
#define X86_FEATURE_AMD_IBRS_SAME_MODE (13*32+19) /* Indirect Branch Restricted Speculation same mode protection*/
#define X86_FEATURE_EFER_LMSLE_MBZ (13*32+20) /* EFER.LMSLE must be zero */
#define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */
#define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */
#define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */
@@ -504,6 +505,7 @@
* can access host MMIO (ignored for all intents
* and purposes if CLEAR_CPU_BUF_VM is set).
*/
#define X86_FEATURE_X2AVIC_EXT (21*32+18) /* AMD SVM x2AVIC support for 4k vCPUs */
/*
* BUG word(s)


@@ -2139,6 +2139,11 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
* the gfn, i.e. retrying the instruction will hit a
* !PRESENT fault, which results in a new shadow page
* and sends KVM back to square one.
*
* EMULTYPE_SKIP_SOFT_INT - Set in combination with EMULTYPE_SKIP to only skip
* an instruction if it could generate a given software
* interrupt, which must be encoded via
* EMULTYPE_SET_SOFT_INT_VECTOR().
*/
#define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1)
@@ -2149,6 +2154,10 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
#define EMULTYPE_PF (1 << 6)
#define EMULTYPE_COMPLETE_USER_EXIT (1 << 7)
#define EMULTYPE_WRITE_PF_TO_SP (1 << 8)
#define EMULTYPE_SKIP_SOFT_INT (1 << 9)
#define EMULTYPE_SET_SOFT_INT_VECTOR(v) ((u32)((v) & 0xff) << 16)
#define EMULTYPE_GET_SOFT_INT_VECTOR(e) (((e) >> 16) & 0xff)
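For reference, the new flags compose with EMULTYPE_SKIP the same way the SVM
code later in this series uses them; a minimal illustration (vector 0x80 is an
arbitrary example value):

	/* Skip the instruction only if it is "int $0x80". */
	int emul_type = EMULTYPE_SKIP | EMULTYPE_SKIP_SOFT_INT |
			EMULTYPE_SET_SOFT_INT_VECTOR(0x80);

	/* The emulator recovers the vector to validate the decoded instruction. */
	u8 vector = EMULTYPE_GET_SOFT_INT_VECTOR(emul_type);	/* == 0x80 */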
static inline bool kvm_can_emulate_event_vectoring(int emul_type)
{


@@ -279,7 +279,7 @@ enum avic_ipi_failure_cause {
AVIC_IPI_FAILURE_INVALID_IPI_VECTOR,
};
#define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(8, 0)
#define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(11, 0)
/*
* For AVIC, the max index allowed for physical APIC ID table is 0xfe (254), as
@@ -289,11 +289,14 @@ enum avic_ipi_failure_cause {
/*
* For x2AVIC, the max index allowed for physical APIC ID table is 0x1ff (511).
* With X86_FEATURE_X2AVIC_EXT, the max index is increased to 0xfff (4095).
*/
#define X2AVIC_MAX_PHYSICAL_ID 0x1FFUL
#define X2AVIC_4K_MAX_PHYSICAL_ID 0xFFFUL
static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_PHYSICAL_ID);
static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID);
static_assert((X2AVIC_4K_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_4K_MAX_PHYSICAL_ID);
#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0)
#define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3)


@@ -502,6 +502,7 @@ struct kvm_sync_regs {
/* vendor-specific groups and attributes for system fd */
#define KVM_X86_GRP_SEV 1
# define KVM_X86_SEV_VMSA_FEATURES 0
# define KVM_X86_SNP_POLICY_BITS 1
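Userspace would presumably query the new attribute the same way it queries
KVM_X86_SEV_VMSA_FEATURES, i.e. via KVM_GET_DEVICE_ATTR on the /dev/kvm
(system) fd; a sketch under that assumption:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical helper: returns 0 if the attribute is unsupported. */
static uint64_t get_snp_policy_bits(int kvm_fd)
{
	uint64_t bits = 0;
	struct kvm_device_attr attr = {
		.group = KVM_X86_GRP_SEV,
		.attr  = KVM_X86_SNP_POLICY_BITS,
		.addr  = (uint64_t)(unsigned long)&bits,
	};

	if (ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr))
		return 0;

	return bits;
}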
struct kvm_vmx_nested_state_data {
__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];


@@ -49,6 +49,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
{ X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 },
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
{ X86_FEATURE_X2AVIC_EXT, CPUID_ECX, 6, 0x8000000a, 0 },
{ X86_FEATURE_COHERENCY_SFW_NO, CPUID_EBX, 31, 0x8000001f, 0 },
{ X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 },
{ X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 },


@@ -1135,6 +1135,7 @@ void kvm_set_cpu_caps(void)
F(AMD_STIBP),
F(AMD_STIBP_ALWAYS_ON),
F(AMD_IBRS_SAME_MODE),
PASSTHROUGH_F(EFER_LMSLE_MBZ),
F(AMD_PSFD),
F(AMD_IBPB_RET),
);


@@ -106,7 +106,7 @@ static u32 next_vm_id = 0;
static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
static bool x2avic_enabled;
static u32 x2avic_max_physical_id;
static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm,
bool intercept)
@@ -158,12 +158,40 @@ static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm,
svm->x2avic_msrs_intercepted = intercept;
}
static u32 __avic_get_max_physical_id(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
u32 arch_max;
/*
* Return the largest size (x2APIC) when querying without a vCPU, e.g.
* to allocate the per-VM table.
*/
if (x2avic_enabled && (!vcpu || apic_x2apic_mode(vcpu->arch.apic)))
arch_max = x2avic_max_physical_id;
else
arch_max = AVIC_MAX_PHYSICAL_ID;
/*
* Despite its name, KVM_CAP_MAX_VCPU_ID represents the maximum APIC ID
* plus one, so the max possible APIC ID is one less than that.
*/
return min(kvm->arch.max_vcpu_ids - 1, arch_max);
}
static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu)
{
return __avic_get_max_physical_id(vcpu->kvm, vcpu);
}
static void avic_activate_vmcb(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
struct kvm_vcpu *vcpu = &svm->vcpu;
vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
vmcb->control.avic_physical_id |= avic_get_max_physical_id(vcpu);
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
@@ -176,7 +204,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
*/
if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
vmcb->control.int_ctl |= X2APIC_MODE_MASK;
vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
/* Disabling MSR intercept for x2APIC registers */
avic_set_x2apic_msr_interception(svm, false);
} else {
@@ -186,8 +214,6 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
*/
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
/* For xAVIC and hybrid-xAVIC modes */
vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
/* Enabling MSR intercept for x2APIC registers */
avic_set_x2apic_msr_interception(svm, true);
}
@@ -247,6 +273,30 @@ static int avic_ga_log_notifier(u32 ga_tag)
return 0;
}
static int avic_get_physical_id_table_order(struct kvm *kvm)
{
/* Provision for the maximum physical ID supported in x2avic mode */
return get_order((__avic_get_max_physical_id(kvm, NULL) + 1) * sizeof(u64));
}
int avic_alloc_physical_id_table(struct kvm *kvm)
{
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
if (!irqchip_in_kernel(kvm) || !enable_apicv)
return 0;
if (kvm_svm->avic_physical_id_table)
return 0;
kvm_svm->avic_physical_id_table = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
avic_get_physical_id_table_order(kvm));
if (!kvm_svm->avic_physical_id_table)
return -ENOMEM;
return 0;
}
void avic_vm_destroy(struct kvm *kvm)
{
unsigned long flags;
@@ -256,7 +306,8 @@ void avic_vm_destroy(struct kvm *kvm)
return;
free_page((unsigned long)kvm_svm->avic_logical_id_table);
free_page((unsigned long)kvm_svm->avic_physical_id_table);
free_pages((unsigned long)kvm_svm->avic_physical_id_table,
avic_get_physical_id_table_order(kvm));
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
hash_del(&kvm_svm->hnode);
@@ -274,10 +325,6 @@ int avic_vm_init(struct kvm *kvm)
if (!enable_apicv)
return 0;
kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!kvm_svm->avic_physical_id_table)
goto free_avic;
kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!kvm_svm->avic_logical_id_table)
goto free_avic;
@@ -342,7 +389,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
* fully initialized AVIC.
*/
if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
(id > X2AVIC_MAX_PHYSICAL_ID)) {
(id > x2avic_max_physical_id)) {
kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG);
vcpu->arch.apic->apicv_active = false;
return 0;
@@ -562,7 +609,7 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
u32 icrl = svm->vmcb->control.exit_info_1;
u32 id = svm->vmcb->control.exit_info_2 >> 32;
u32 index = svm->vmcb->control.exit_info_2 & 0x1FF;
u32 index = svm->vmcb->control.exit_info_2 & AVIC_PHYSICAL_MAX_INDEX_MASK;
struct kvm_lapic *apic = vcpu->arch.apic;
trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
@@ -962,7 +1009,8 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >=
PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm)))
return;
/*
@@ -1024,7 +1072,8 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
lockdep_assert_preemption_disabled();
if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >=
PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm)))
return;
/*
@@ -1226,10 +1275,15 @@ bool __init avic_hardware_setup(void)
/* AVIC is a prerequisite for x2AVIC. */
x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
if (x2avic_enabled)
pr_info("x2AVIC enabled\n");
else
if (x2avic_enabled) {
if (cpu_feature_enabled(X86_FEATURE_X2AVIC_EXT))
x2avic_max_physical_id = X2AVIC_4K_MAX_PHYSICAL_ID;
else
x2avic_max_physical_id = X2AVIC_MAX_PHYSICAL_ID;
pr_info("x2AVIC enabled (max %u vCPUs)\n", x2avic_max_physical_id + 1);
} else {
svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
}
/*
* Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)


@@ -613,6 +613,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
struct kvm_vcpu *vcpu = &svm->vcpu;
nested_vmcb02_compute_g_pat(svm);
vmcb_mark_dirty(vmcb02, VMCB_NPT);
/* Load the nested guest state */
if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
@@ -751,6 +752,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP);
/*
* Stash vmcb02's counter if the guest hasn't moved past the guilty
@@ -1430,16 +1432,6 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
case SVM_EXIT_IOIO:
vmexit = nested_svm_intercept_ioio(svm);
break;
case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
/*
* Host-intercepted exceptions have been checked already in

View File

@@ -65,20 +65,24 @@ module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 04
#define AP_RESET_HOLD_NAE_EVENT 1
#define AP_RESET_HOLD_MSR_PROTO 2
/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */
#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0)
#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8)
#define SNP_POLICY_MASK_SMT BIT_ULL(16)
#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17)
#define SNP_POLICY_MASK_DEBUG BIT_ULL(19)
#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20)
/*
* SEV-SNP policy bits that can be supported by KVM. These include policy bits
* that have implementation support within KVM or policy bits that do not
* require implementation support within KVM to enforce the policy.
*/
#define KVM_SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \
SNP_POLICY_MASK_API_MAJOR | \
SNP_POLICY_MASK_SMT | \
SNP_POLICY_MASK_RSVD_MBO | \
SNP_POLICY_MASK_DEBUG | \
SNP_POLICY_MASK_SINGLE_SOCKET | \
SNP_POLICY_MASK_CXL_ALLOW | \
SNP_POLICY_MASK_MEM_AES_256_XTS | \
SNP_POLICY_MASK_RAPL_DIS | \
SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM | \
SNP_POLICY_MASK_PAGE_SWAP_DISABLE)
#define SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \
SNP_POLICY_MASK_API_MAJOR | \
SNP_POLICY_MASK_SMT | \
SNP_POLICY_MASK_RSVD_MBO | \
SNP_POLICY_MASK_DEBUG | \
SNP_POLICY_MASK_SINGLE_SOCKET)
static u64 snp_supported_policy_bits __ro_after_init;
#define INITIAL_VMSA_GPA 0xFFFFFFFFF000
@@ -2143,6 +2147,10 @@ int sev_dev_get_attr(u32 group, u64 attr, u64 *val)
*val = sev_supported_vmsa_features;
return 0;
case KVM_X86_SNP_POLICY_BITS:
*val = snp_supported_policy_bits;
return 0;
default:
return -ENXIO;
}
@@ -2207,7 +2215,7 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (params.flags)
return -EINVAL;
if (params.policy & ~SNP_POLICY_MASK_VALID)
if (params.policy & ~snp_supported_policy_bits)
return -EINVAL;
/* Check for policy bits that must be set */
@@ -3100,8 +3108,11 @@ out:
else if (sev_snp_supported)
sev_snp_supported = is_sev_snp_initialized();
if (sev_snp_supported)
if (sev_snp_supported) {
snp_supported_policy_bits = sev_get_snp_policy_bits() &
KVM_SNP_POLICY_MASK_VALID;
nr_ciphertext_hiding_asids = init_args.max_snp_asid;
}
/*
* If ciphertext hiding is enabled, the joint SEV-ES/SEV-SNP
@@ -5085,10 +5096,10 @@ struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
/* Check if the SEV policy allows debugging */
if (sev_snp_guest(vcpu->kvm)) {
if (!(sev->policy & SNP_POLICY_DEBUG))
if (!(sev->policy & SNP_POLICY_MASK_DEBUG))
return NULL;
} else {
if (sev->policy & SEV_POLICY_NODBG)
if (sev->policy & SEV_POLICY_MASK_NODBG)
return NULL;
}


@@ -272,6 +272,7 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
}
static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
int emul_type,
bool commit_side_effects)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -293,7 +294,7 @@ static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
if (unlikely(!commit_side_effects))
old_rflags = svm->vmcb->save.rflags;
if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
if (!kvm_emulate_instruction(vcpu, emul_type))
return 0;
if (unlikely(!commit_side_effects))
@@ -311,11 +312,13 @@ done:
static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
return __svm_skip_emulated_instruction(vcpu, true);
return __svm_skip_emulated_instruction(vcpu, EMULTYPE_SKIP, true);
}
static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu, u8 vector)
{
const int emul_type = EMULTYPE_SKIP | EMULTYPE_SKIP_SOFT_INT |
EMULTYPE_SET_SOFT_INT_VECTOR(vector);
unsigned long rip, old_rip = kvm_rip_read(vcpu);
struct vcpu_svm *svm = to_svm(vcpu);
@@ -331,7 +334,7 @@ static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
* in use, the skip must not commit any side effects such as clearing
* the interrupt shadow or RFLAGS.RF.
*/
if (!__svm_skip_emulated_instruction(vcpu, !nrips))
if (!__svm_skip_emulated_instruction(vcpu, emul_type, !nrips))
return -EIO;
rip = kvm_rip_read(vcpu);
@@ -367,7 +370,7 @@ static void svm_inject_exception(struct kvm_vcpu *vcpu)
kvm_deliver_exception_payload(vcpu, ex);
if (kvm_exception_is_soft(ex->vector) &&
svm_update_soft_interrupt_rip(vcpu))
svm_update_soft_interrupt_rip(vcpu, ex->vector))
return;
svm->vmcb->control.event_inj = ex->vector
@@ -1198,6 +1201,11 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
svm->vmcb = target_vmcb->ptr;
}
static int svm_vcpu_precreate(struct kvm *kvm)
{
return avic_alloc_physical_id_table(kvm);
}
static int svm_vcpu_create(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
@@ -3628,11 +3636,12 @@ static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu)
static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
struct kvm_queued_interrupt *intr = &vcpu->arch.interrupt;
struct vcpu_svm *svm = to_svm(vcpu);
u32 type;
if (vcpu->arch.interrupt.soft) {
if (svm_update_soft_interrupt_rip(vcpu))
if (intr->soft) {
if (svm_update_soft_interrupt_rip(vcpu, intr->nr))
return;
type = SVM_EVTINJ_TYPE_SOFT;
@@ -3640,12 +3649,10 @@ static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
type = SVM_EVTINJ_TYPE_INTR;
}
trace_kvm_inj_virq(vcpu->arch.interrupt.nr,
vcpu->arch.interrupt.soft, reinjected);
trace_kvm_inj_virq(intr->nr, intr->soft, reinjected);
++vcpu->stat.irq_injections;
svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
SVM_EVTINJ_VALID | type;
svm->vmcb->control.event_inj = intr->nr | SVM_EVTINJ_VALID | type;
}
void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
@@ -4511,31 +4518,45 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
case SVM_EXIT_WRITE_CR0: {
unsigned long cr0, val;
if (info->intercept == x86_intercept_cr_write)
/*
* Adjust the exit code accordingly if a CR other than CR0 is
* being written, and skip straight to the common handling as
* only CR0 has an additional selective intercept.
*/
if (info->intercept == x86_intercept_cr_write && info->modrm_reg) {
icpt_info.exit_code += info->modrm_reg;
if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
info->intercept == x86_intercept_clts)
break;
if (!(vmcb12_is_intercept(&svm->nested.ctl,
INTERCEPT_SELECTIVE_CR0)))
break;
cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
if (info->intercept == x86_intercept_lmsw) {
cr0 &= 0xfUL;
val &= 0xfUL;
/* lmsw can't clear PE - catch this here */
if (cr0 & X86_CR0_PE)
val |= X86_CR0_PE;
}
/*
* Convert the exit_code to SVM_EXIT_CR0_SEL_WRITE if a
* selective CR0 intercept is triggered (the common logic will
* treat the selective intercept as being enabled). Note, the
* unconditional intercept has higher priority, i.e. this is
* only relevant if *only* the selective intercept is enabled.
*/
if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_CR0_WRITE) ||
!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))
break;
/* CLTS never triggers INTERCEPT_SELECTIVE_CR0 */
if (info->intercept == x86_intercept_clts)
break;
/* LMSW always triggers INTERCEPT_SELECTIVE_CR0 */
if (info->intercept == x86_intercept_lmsw) {
icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
break;
}
/*
* MOV-to-CR0 only triggers INTERCEPT_SELECTIVE_CR0 if any bit
* other than SVM_CR0_SELECTIVE_MASK is changed.
*/
cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
if (cr0 ^ val)
icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
break;
}
case SVM_EXIT_READ_DR0:
@@ -5005,6 +5026,7 @@ struct kvm_x86_ops svm_x86_ops __initdata = {
.emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu,
.has_emulated_msr = svm_has_emulated_msr,
.vcpu_precreate = svm_vcpu_precreate,
.vcpu_create = svm_vcpu_create,
.vcpu_free = svm_vcpu_free,
.vcpu_reset = svm_vcpu_reset,
@@ -5309,7 +5331,9 @@ static __init int svm_hardware_setup(void)
if (nested) {
pr_info("Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
kvm_enable_efer_bits(EFER_SVME);
if (!boot_cpu_has(X86_FEATURE_EFER_LMSLE_MBZ))
kvm_enable_efer_bits(EFER_LMSLE);
r = nested_svm_init_msrpm_merge_offsets();
if (r)


@@ -117,9 +117,6 @@ struct kvm_sev_info {
cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */
};
#define SEV_POLICY_NODBG BIT_ULL(0)
#define SNP_POLICY_DEBUG BIT_ULL(19)
struct kvm_svm {
struct kvm kvm;
@@ -807,6 +804,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
bool __init avic_hardware_setup(void);
void avic_hardware_unsetup(void);
int avic_alloc_physical_id_table(struct kvm *kvm);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);


@@ -52,11 +52,23 @@
* there must not be any returns or indirect branches between this code
* and vmentry.
*/
movl SVM_spec_ctrl(%_ASM_DI), %eax
cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax
#ifdef CONFIG_X86_64
mov SVM_spec_ctrl(%rdi), %rdx
cmp PER_CPU_VAR(x86_spec_ctrl_current), %rdx
je 801b
movl %edx, %eax
shr $32, %rdx
#else
mov SVM_spec_ctrl(%edi), %eax
mov PER_CPU_VAR(x86_spec_ctrl_current), %ecx
xor %eax, %ecx
mov SVM_spec_ctrl + 4(%edi), %edx
mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %esi
xor %edx, %esi
or %esi, %ecx
je 801b
#endif
mov $MSR_IA32_SPEC_CTRL, %ecx
xor %edx, %edx
wrmsr
jmp 801b
.endm
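For context on the SPEC_CTRL[63:32] clobber called out in the merge summary:
WRMSR always writes the full 64-bit value from EDX:EAX, so the old sequence,
which compared only the low 32 bits and then executed "xor %edx, %edx; wrmsr",
silently zeroed the upper half of the MSR.  Conceptually, in C (illustrative
only, not actual KVM code):

	/* Old behaviour: only bits 31:0 compared, bits 63:32 written as zero. */
	if ((u32)svm->spec_ctrl != (u32)this_cpu_read(x86_spec_ctrl_current))
		wrmsrl(MSR_IA32_SPEC_CTRL, (u32)svm->spec_ctrl);

	/* New behaviour: compare and restore the full 64-bit value. */
	if (svm->spec_ctrl != this_cpu_read(x86_spec_ctrl_current))
		wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);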
@@ -81,13 +93,25 @@
jnz 998f
rdmsr
movl %eax, SVM_spec_ctrl(%_ASM_DI)
movl %edx, SVM_spec_ctrl + 4(%_ASM_DI)
998:
/* Now restore the host value of the MSR if different from the guest's. */
movl PER_CPU_VAR(x86_spec_ctrl_current), %eax
cmp SVM_spec_ctrl(%_ASM_DI), %eax
#ifdef CONFIG_X86_64
mov PER_CPU_VAR(x86_spec_ctrl_current), %rdx
cmp SVM_spec_ctrl(%rdi), %rdx
je 901b
xor %edx, %edx
movl %edx, %eax
shr $32, %rdx
#else
mov PER_CPU_VAR(x86_spec_ctrl_current), %eax
mov SVM_spec_ctrl(%edi), %esi
xor %eax, %esi
mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %edx
mov SVM_spec_ctrl + 4(%edi), %edi
xor %edx, %edi
or %edi, %esi
je 901b
#endif
wrmsr
jmp 901b
.endm
@@ -136,7 +160,7 @@ SYM_FUNC_START(__svm_vcpu_run)
mov %_ASM_ARG1, %_ASM_DI
.endif
/* Clobbers RAX, RCX, RDX. */
/* Clobbers RAX, RCX, RDX (and ESI on 32-bit), consumes RDI (@svm). */
RESTORE_GUEST_SPEC_CTRL
/*
@@ -213,7 +237,10 @@ SYM_FUNC_START(__svm_vcpu_run)
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT
/* Clobbers RAX, RCX, RDX. */
/*
* Clobbers RAX, RCX, RDX (and ESI, EDI on 32-bit), consumes RDI (@svm)
* and RSP (pointer to @spec_ctrl_intercepted).
*/
RESTORE_HOST_SPEC_CTRL
/*
@@ -333,7 +360,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
mov %rdi, SEV_ES_RDI (%rdx)
mov %rsi, SEV_ES_RSI (%rdx)
/* Clobbers RAX, RCX, RDX (@hostsa). */
/* Clobbers RAX, RCX, and RDX (@hostsa), consumes RDI (@svm). */
RESTORE_GUEST_SPEC_CTRL
/* Get svm->current_vmcb->pa into RAX. */


@@ -9332,6 +9332,23 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
return false;
}
static bool is_soft_int_instruction(struct x86_emulate_ctxt *ctxt,
int emulation_type)
{
u8 vector = EMULTYPE_GET_SOFT_INT_VECTOR(emulation_type);
switch (ctxt->b) {
case 0xcc:
return vector == BP_VECTOR;
case 0xcd:
return vector == ctxt->src.val;
case 0xce:
return vector == OF_VECTOR;
default:
return false;
}
}
/*
* Decode an instruction for emulation. The caller is responsible for handling
* code breakpoints. Note, manually detecting code breakpoints is unnecessary
@@ -9442,6 +9459,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* injecting single-step #DBs.
*/
if (emulation_type & EMULTYPE_SKIP) {
if (emulation_type & EMULTYPE_SKIP_SOFT_INT &&
!is_soft_int_instruction(ctxt, emulation_type))
return 0;
if (ctxt->mode != X86EMUL_MODE_PROT64)
ctxt->eip = (u32)ctxt->_eip;
else


@@ -2777,6 +2777,43 @@ void sev_platform_shutdown(void)
}
EXPORT_SYMBOL_GPL(sev_platform_shutdown);
u64 sev_get_snp_policy_bits(void)
{
struct psp_device *psp = psp_master;
struct sev_device *sev;
u64 policy_bits;
if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
return 0;
if (!psp || !psp->sev_data)
return 0;
sev = psp->sev_data;
policy_bits = SNP_POLICY_MASK_BASE;
if (sev->snp_plat_status.feature_info) {
if (sev->snp_feat_info_0.ecx & SNP_RAPL_DISABLE_SUPPORTED)
policy_bits |= SNP_POLICY_MASK_RAPL_DIS;
if (sev->snp_feat_info_0.ecx & SNP_CIPHER_TEXT_HIDING_SUPPORTED)
policy_bits |= SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM;
if (sev->snp_feat_info_0.ecx & SNP_AES_256_XTS_POLICY_SUPPORTED)
policy_bits |= SNP_POLICY_MASK_MEM_AES_256_XTS;
if (sev->snp_feat_info_0.ecx & SNP_CXL_ALLOW_POLICY_SUPPORTED)
policy_bits |= SNP_POLICY_MASK_CXL_ALLOW;
if (sev_version_greater_or_equal(1, 58))
policy_bits |= SNP_POLICY_MASK_PAGE_SWAP_DISABLE;
}
return policy_bits;
}
EXPORT_SYMBOL_GPL(sev_get_snp_policy_bits);
void sev_dev_destroy(struct psp_device *psp)
{
struct sev_device *sev = psp->sev_data;


@@ -14,6 +14,39 @@
#include <uapi/linux/psp-sev.h>
/* As defined by SEV API, under "Guest Policy". */
#define SEV_POLICY_MASK_NODBG BIT(0)
#define SEV_POLICY_MASK_NOKS BIT(1)
#define SEV_POLICY_MASK_ES BIT(2)
#define SEV_POLICY_MASK_NOSEND BIT(3)
#define SEV_POLICY_MASK_DOMAIN BIT(4)
#define SEV_POLICY_MASK_SEV BIT(5)
#define SEV_POLICY_MASK_API_MAJOR GENMASK(23, 16)
#define SEV_POLICY_MASK_API_MINOR GENMASK(31, 24)
/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */
#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0)
#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8)
#define SNP_POLICY_MASK_SMT BIT_ULL(16)
#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17)
#define SNP_POLICY_MASK_MIGRATE_MA BIT_ULL(18)
#define SNP_POLICY_MASK_DEBUG BIT_ULL(19)
#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20)
#define SNP_POLICY_MASK_CXL_ALLOW BIT_ULL(21)
#define SNP_POLICY_MASK_MEM_AES_256_XTS BIT_ULL(22)
#define SNP_POLICY_MASK_RAPL_DIS BIT_ULL(23)
#define SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM BIT_ULL(24)
#define SNP_POLICY_MASK_PAGE_SWAP_DISABLE BIT_ULL(25)
/* Base SEV-SNP policy bitmask for minimum supported SEV firmware version */
#define SNP_POLICY_MASK_BASE (SNP_POLICY_MASK_API_MINOR | \
SNP_POLICY_MASK_API_MAJOR | \
SNP_POLICY_MASK_SMT | \
SNP_POLICY_MASK_RSVD_MBO | \
SNP_POLICY_MASK_MIGRATE_MA | \
SNP_POLICY_MASK_DEBUG | \
SNP_POLICY_MASK_SINGLE_SOCKET)
#define SEV_FW_BLOB_MAX_SIZE 0x4000 /* 16KB */
/**
@@ -849,7 +882,10 @@ struct snp_feature_info {
u32 edx;
} __packed;
#define SNP_RAPL_DISABLE_SUPPORTED BIT(2)
#define SNP_CIPHER_TEXT_HIDING_SUPPORTED BIT(3)
#define SNP_AES_256_XTS_POLICY_SUPPORTED BIT(4)
#define SNP_CXL_ALLOW_POLICY_SUPPORTED BIT(5)
#ifdef CONFIG_CRYPTO_DEV_SP_PSP
@@ -995,6 +1031,7 @@ void *snp_alloc_firmware_page(gfp_t mask);
void snp_free_firmware_page(void *addr);
void sev_platform_shutdown(void);
bool sev_is_snp_ciphertext_hiding_supported(void);
u64 sev_get_snp_policy_bits(void);
#else /* !CONFIG_CRYPTO_DEV_SP_PSP */