摘要:本文分析了DeamonSetController及PriorityClass Validate时,对CriticalPod的所做的特殊处理。
在DaemonSetController判断某个node上是否要运行某个DaemonSet时,会调用DaemonSetsController.simulate来分析PredicateFailureReason。
pkg/controller/daemon/daemon_controller.go:1206
func (dsc *DaemonSetsController) simulate(newPod *v1.Pod, node *v1.Node, ds *apps.DaemonSet) ([]algorithm.PredicateFailureReason, *schedulercache.NodeInfo, error) {
// DaemonSet pods shouldn't be deleted by NodeController in case of node problems.
// Add infinite toleration for taint notReady:NoExecute here
// to survive taint-based eviction enforced by NodeController
// when node turns not ready.
v1helper.AddOrUpdateTolerationInPod(newPod, &v1.Toleration{
Key: algorithm.TaintNodeNotReady,
Operator: v1.TolerationOpExists,
Effect: v1.TaintEffectNoExecute,
})
// DaemonSet pods shouldn't be deleted by NodeController in case of node problems.
// Add infinite toleration for taint unreachable:NoExecute here
// to survive taint-based eviction enforced by NodeController
// when node turns unreachable.
v1helper.AddOrUpdateTolerationInPod(newPod, &v1.Toleration{
Key: algorithm.TaintNodeUnreachable,
Operator: v1.TolerationOpExists,
Effect: v1.TaintEffectNoExecute,
})
// According to TaintNodesByCondition, all DaemonSet pods should tolerate
// MemoryPressure and DisPressure taints, and the critical pods should tolerate
// OutOfDisk taint additional.
v1helper.AddOrUpdateTolerationInPod(newPod, &v1.Toleration{
Key: algorithm.TaintNodeDiskPressure,
Operator: v1.TolerationOpExists,
Effect: v1.TaintEffectNoSchedule,
})
v1helper.AddOrUpdateTolerationInPod(newPod, &v1.Toleration{
Key: algorithm.TaintNodeMemoryPressure,
Operator: v1.TolerationOpExists,
Effect: v1.TaintEffectNoSchedule,
})
// TODO(#48843) OutOfDisk taints will be removed in 1.10
if utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) &&
kubelettypes.IsCriticalPod(newPod) {
v1helper.AddOrUpdateTolerationInPod(newPod, &v1.Toleration{
Key: algorithm.TaintNodeOutOfDisk,
Operator: v1.TolerationOpExists,
Effect: v1.TaintEffectNoSchedule,
})
}
...
_, reasons, err := Predicates(newPod, nodeInfo)
return reasons, nodeInfo, err
}
OutOfDisk:NoSchedule
Toleration。在simulate中,还会像类似scheduler一样,进行Predicates处理。Predicates过程中也对CriticalPod做了区分对待。
pkg/controller/daemon/daemon_controller.go:1413
// Predicates checks if a DaemonSet's pod can be scheduled on a node using GeneralPredicates
// and PodToleratesNodeTaints predicate
func Predicates(pod *v1.Pod, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
var predicateFails []algorithm.PredicateFailureReason
// If ScheduleDaemonSetPods is enabled, only check nodeSelector and nodeAffinity.
if false /*disabled for 1.10*/ && utilfeature.DefaultFeatureGate.Enabled(features.ScheduleDaemonSetPods) {
fit, reasons, err := nodeSelectionPredicates(pod, nil, nodeInfo)
if err != nil {
return false, predicateFails, err
}
if !fit {
predicateFails = append(predicateFails, reasons...)
}
return len(predicateFails) == 0, predicateFails, nil
}
critical := utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) &&
kubelettypes.IsCriticalPod(pod)
fit, reasons, err := predicates.PodToleratesNodeTaints(pod, nil, nodeInfo)
if err != nil {
return false, predicateFails, err
}
if !fit {
predicateFails = append(predicateFails, reasons...)
}
if critical {
// If the pod is marked as critical and support for critical pod annotations is enabled,
// check predicates for critical pods only.
fit, reasons, err = predicates.EssentialPredicates(pod, nil, nodeInfo)
} else {
fit, reasons, err = predicates.GeneralPredicates(pod, nil, nodeInfo)
}
if err != nil {
return false, predicateFails, err
}
if !fit {
predicateFails = append(predicateFails, reasons...)
}
return len(predicateFails) == 0, predicateFails, nil
}
pkg/scheduler/algorithm/predicates/predicates.go:1076
// noncriticalPredicates are the predicates that only non-critical pods need
func noncriticalPredicates(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
var predicateFails []algorithm.PredicateFailureReason
fit, reasons, err := PodFitsResources(pod, meta, nodeInfo)
if err != nil {
return false, predicateFails, err
}
if !fit {
predicateFails = append(predicateFails, reasons...)
}
return len(predicateFails) == 0, predicateFails, nil
}
因此,对于CriticalPod,DeamonSetController进行Predicate时不会进行PodFitsResources检查。
在Kubernetes 1.11中,很重要的个更新就是,Priority和Preemption从alpha升级为Beta了,并且是Enabled by default。
Kubernetes Version | Priority and Preemption State | Enabled by default |
---|---|---|
1.8 | alpha | no |
1.9 | alpha | no |
1.10 | alpha | no |
1.11 | beta | yes |
PriorityClass是属于scheduling.k8s.io/v1alpha1
GroupVersion的,在client提交创建PriorityClass请求后,写入etcd前,会进行合法性检查(Validate),这其中就有对SystemClusterCritical和SystemNodeCritical两个PriorityClass的特殊对待。
pkg/apis/scheduling/validation/validation.go:30
// ValidatePriorityClass tests whether required fields in the PriorityClass are
// set correctly.
func ValidatePriorityClass(pc *scheduling.PriorityClass) field.ErrorList {
...
// If the priorityClass starts with a system prefix, it must be one of the
// predefined system priority classes.
if strings.HasPrefix(pc.Name, scheduling.SystemPriorityClassPrefix) {
if is, err := scheduling.IsKnownSystemPriorityClass(pc); !is {
allErrs = append(allErrs, field.Forbidden(field.NewPath("metadata", "name"), "priority class names with '"+scheduling.SystemPriorityClassPrefix+"' prefix are reserved for system use only. error: "+err.Error()))
}
}
...
return allErrs
}
// IsKnownSystemPriorityClass checks that "pc" is equal to one of the system PriorityClasses.
// It ignores "description", labels, annotations, etc. of the PriorityClass.
func IsKnownSystemPriorityClass(pc *PriorityClass) (bool, error) {
for _, spc := range systemPriorityClasses {
if spc.Name == pc.Name {
if spc.Value != pc.Value {
return false, fmt.Errorf("value of %v PriorityClass must be %v", spc.Name, spc.Value)
}
if spc.GlobalDefault != pc.GlobalDefault {
return false, fmt.Errorf("globalDefault of %v PriorityClass must be %v", spc.Name, spc.GlobalDefault)
}
return true, nil
}
}
return false, fmt.Errorf("%v is not a known system priority class", pc.Name)
}
system-cluster-critical
或者system-node-critical
之一。否则就会Validate Error,拒绝提交。system-cluster-critical
或者system-node-critical
,那么要求globalDefault必须为false,即system-cluster-critical
或者system-node-critical
不能是全局默认的PriorityClass。另外,在PriorityClass进行Update时,目前是不允许其Name和Value的,也就是说只能更新Description和globalDefault。
pkg/apis/scheduling/helpers.go:27
// SystemPriorityClasses define system priority classes that are auto-created at cluster bootstrapping.
// Our API validation logic ensures that any priority class that has a system prefix or its value
// is higher than HighestUserDefinablePriority is equal to one of these SystemPriorityClasses.
var systemPriorityClasses = []*PriorityClass{
{
ObjectMeta: metav1.ObjectMeta{
Name: SystemNodeCritical,
},
Value: SystemCriticalPriority + 1000,
Description: "Used for system critical pods that must not be moved from their current node.",
},
{
ObjectMeta: metav1.ObjectMeta{
Name: SystemClusterCritical,
},
Value: SystemCriticalPriority,
Description: "Used for system critical pods that must run in the cluster, but can be moved to another node if necessary.",
},
}
因此DeamonSetController及PriorityClass Validate时,对CriticalPod的特殊处理总结如下:
OutOfDisk:NoSchedule
Toleration。system-cluster-critical
或者system-node-critical
之一。否则就会Validate Error,拒绝提交。system-cluster-critical
或者system-node-critical
,那么要求globalDefault必须为false,即system-cluster-critical
或者system-node-critical
不能是全局默认的PriorityClass。