Posted on 

cloud provider route controller

当 pod 的 CIDR 与其所属 node 不在同一个 CIDR 内时,在部分云上会出现 pod 与 node 之间网络无法互通的问题,而 route controller 正是为了通过创建路由来解决这个问题而设计的。

如果计划让 k8s 启动的 pod 从指定的 CIDR 中分配 IP,常见的方式是通过 KCM 的启动参数 `cluster-cidr` 来指定 pod CIDR;service 也可以通过 KCM 的 `service-cluster-ip-range` 参数指定 CIDR。

cloud provider 启动时,通过 AllocateNodeCIDRs 与 ConfigureCloudRoutes 两个字段的“与”逻辑来判断是否需要启动 route controller,也就是 k8s 的 pod 是否从指定的 CIDR 中分配 IP、并且需要把对应的路由配置到云上。AllocateNodeCIDRs 字段的含义是:"AllocateNodeCIDRs enables CIDRs for Pods to be allocated and, if ConfigureCloudRoutes is true, to be set on the cloud provider.";ConfigureCloudRoutes 字段的含义是:"configureCloudRoutes enables CIDRs allocated with allocateNodeCIDRs to be configured on the cloud provider."

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
// startControllers starts the cloud specific controller loops.
// Only the route-controller portion is shown in this excerpt; "..." marks
// code that was elided.
func startControllers(c *cloudcontrollerconfig.CompletedConfig, stop <-chan struct{}, cloud cloudprovider.Interface) error {
...
// If CIDRs should be allocated for pods and set on the CloudProvider, then start the route controller
if c.ComponentConfig.KubeCloudShared.AllocateNodeCIDRs && c.ComponentConfig.KubeCloudShared.ConfigureCloudRoutes {
// Check whether the cloud provider implements the route-related interface.
if routes, ok := cloud.Routes(); !ok {
// Both flags are set but the provider has no route support: only warn and
// skip starting the route controller.
glog.Warning("configure-cloud-routes is set, but cloud provider does not support routes. Will not configure cloud provider routes.")
} else {
// clusterCIDR stays nil when no (non-blank) cluster CIDR is configured.
var clusterCIDR *net.IPNet
if len(strings.TrimSpace(c.ComponentConfig.KubeCloudShared.ClusterCIDR)) != 0 {
// NOTE(review): err is assigned, not declared, here; presumably it is declared
// in the elided code above.
_, clusterCIDR, err = net.ParseCIDR(c.ComponentConfig.KubeCloudShared.ClusterCIDR)
if err != nil {
// A parse failure is only logged; the route controller is still created below
// with whatever clusterCIDR value ParseCIDR returned.
glog.Warningf("Unsuccessful parsing of cluster CIDR %v: %v", c.ComponentConfig.KubeCloudShared.ClusterCIDR, err)
}
}

// Build the route controller from the provider's routes, a dedicated client,
// the shared node informer, the cluster name and the parsed cluster CIDR.
routeController := routecontroller.New(routes, client("route-controller"), c.SharedInformers.Core().V1().Nodes(), c.ComponentConfig.KubeCloudShared.ClusterName, clusterCIDR)
// Run in its own goroutine, passing the stop channel and the configured
// route reconciliation period.
go routeController.Run(stop, c.ComponentConfig.KubeCloudShared.RouteReconciliationPeriod.Duration)
...
}

​ 这个 Run 函数保持着 cloud provider 运行 controller 的一贯风格,并没有什么特别要注意的地方。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
// Run starts the periodic route reconciliation. stopCh and syncPeriod are
// handed to wait.NonSlidingUntil together with the reconcile callback.
// Code before and after the visible goroutine is elided ("...").
func (rc *RouteController) Run(stopCh <-chan struct{}, syncPeriod time.Duration) {
...

// TODO: If we do just the full Resync every 5 minutes (default value)
// that means that we may wait up to 5 minutes before even starting
// creating a route for it. This is bad.
// We should have a watch on node and if we observe a new node (with CIDR?)
// trigger reconciliation for that node.
// Presumably NonSlidingUntil invokes the callback every syncPeriod until stopCh
// is closed — confirm against the wait package documentation.
go wait.NonSlidingUntil(func() {
// A failed reconciliation is only logged; nothing is returned to the caller.
if err := rc.reconcileNodeRoutes(); err != nil {
glog.Errorf("Couldn't reconcile node routes: %v", err)
}
}, syncPeriod, stopCh)
...
}

这里的实现唯一值得一提的是 reconcile 这个函数命名:在 k8s 的 controller 中,将现实世界的状态调整为声明式配置所期望状态的过程称为 reconcile。

1
2
3
4
5
// reconcileNodeRoutes lists the routes the cloud provider currently has for
// rc.clusterName and passes them, together with the node list, to rc.reconcile.
// The code that handles err and obtains nodes is elided ("...") in this excerpt.
func (rc *RouteController) reconcileNodeRoutes() error {
routeList, err := rc.routes.ListRoutes(context.TODO(), rc.clusterName)
...
// The result of reconcile is returned unchanged.
return rc.reconcile(nodes, routeList)
}

这是实际的 reconcile 过程:找出现实世界与期望世界之间的差距,然后通过 cloud provider route 所提供的接口操作云上的 route 资源,进行添加/删除操作,使其变成期望的样子。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
// reconcile compares each node's Spec.PodCIDR with the routes reported by the
// cloud provider and creates a route for every node that has a pod CIDR but no
// route with a matching DestinationCIDR. Route creation runs in one goroutine
// per node; the function waits for all of them before returning.
//
// Part of the body is elided ("..." near the end) in this excerpt. The visible
// part always returns nil: creation failures are logged and recorded as events
// but not returned.
func (rc *RouteController) reconcile(nodes []*v1.Node, routes []*cloudprovider.Route) error {
// nodeCIDRs maps nodeName->nodeCIDR
// NOTE(review): it is only written in the visible code; presumably the elided
// section reads it.
nodeCIDRs := make(map[types.NodeName]string)
// routeMap maps routeTargetNode->route
routeMap := make(map[types.NodeName]*cloudprovider.Route)
for _, route := range routes {
// Routes without a target node are not indexed.
if route.TargetNode != "" {
routeMap[route.TargetNode] = route
}
}

wg := sync.WaitGroup{}
// Buffered channel used as a counting semaphore around CreateRoute calls.
rateLimiter := make(chan struct{}, maxConcurrentRouteCreations)

for _, node := range nodes {
// Skip if the node hasn't been assigned a CIDR yet.
if node.Spec.PodCIDR == "" {
continue
}
nodeName := types.NodeName(node.Name)
// Check if we have a route for this node w/ the correct CIDR.
r := routeMap[nodeName]
if r == nil || r.DestinationCIDR != node.Spec.PodCIDR {
// If not, create the route.
route := &cloudprovider.Route{
TargetNode: nodeName,
DestinationCIDR: node.Spec.PodCIDR,
}
// The node's UID is passed to CreateRoute as the name hint.
nameHint := string(node.UID)
wg.Add(1)
// nodeName, nameHint and route are passed as arguments so the goroutine works
// on its own copies rather than the loop's variables.
go func(nodeName types.NodeName, nameHint string, route *cloudprovider.Route) {
defer wg.Done()
// Presumably RetryOnConflict re-runs the closure only for conflict errors,
// using updateNetworkConditionBackoff — confirm against client-go.
err := clientretry.RetryOnConflict(updateNetworkConditionBackoff, func() error {
startTime := time.Now()
// Ensure that we don't have more than maxConcurrentRouteCreations
// CreateRoute calls in flight.
rateLimiter <- struct{}{}
// The logged "throttled" duration is the time spent waiting for a semaphore slot.
glog.Infof("Creating route for node %s %s with hint %s, throttled %v", nodeName, route.DestinationCIDR, nameHint, time.Since(startTime))
err := rc.routes.CreateRoute(context.TODO(), rc.clusterName, nameHint, route)
// Release the slot as soon as the provider call returns.
<-rateLimiter

// Report the outcome on the node: true when the route was created without error.
rc.updateNetworkingCondition(nodeName, err == nil)
if err != nil {
msg := fmt.Sprintf("Could not create route %s %s for node %s after %v: %v", nameHint, route.DestinationCIDR, nodeName, time.Since(startTime), err)
if rc.recorder != nil {
// NOTE(review): msg is passed in the format-string position of Eventf; a '%'
// inside the error text would be interpreted as a formatting verb.
// NOTE(review): UID is populated from the node name, not from node.UID —
// confirm this is intended.
rc.recorder.Eventf(
&v1.ObjectReference{
Kind: "Node",
Name: string(nodeName),
UID: types.UID(nodeName),
Namespace: "",
}, v1.EventTypeWarning, "FailedToCreateRoute", msg)
}
// NOTE(review): same format-string concern as above (Infof with a non-constant format).
glog.V(4).Infof(msg)
return err
}
glog.Infof("Created route for node %s %s with hint %s after %v", nodeName, route.DestinationCIDR, nameHint, time.Now().Sub(startTime))
return nil
})
// The final error after retrying is only logged, not propagated.
if err != nil {
glog.Errorf("Could not create route %s %s for node %s: %v", nameHint, route.DestinationCIDR, nodeName, err)
}
}(nodeName, nameHint, route)
} else {
// A route with the correct CIDR already exists for this node.
// Update condition only if it doesn't reflect the current state.
_, condition := v1node.GetNodeCondition(&node.Status, v1.NodeNetworkUnavailable)
if condition == nil || condition.Status != v1.ConditionFalse {
rc.updateNetworkingCondition(types.NodeName(node.Name), true)
}
}
nodeCIDRs[nodeName] = node.Spec.PodCIDR
}
...
// Block until every route-creation goroutine started above has finished.
wg.Wait()
return nil
}

目前腾讯云 TKE 的 global router 网络模式就符合上述描述:在 global router 模式下,route controller 会在节点启动的时候去云上注册路由。下图中,节点 172.0.0.1 上的 kubelet 上报 ready 时,route controller 会去 VPC 中注册 10.0.0.0/24 的路由,并且 172.0.0.1 上的 pod 都是从 10.0.0.0/24 这个 CIDR 中分配 IP 的,这样就实现了 pod 与 VPC 的互通。

image-20190423113516302
image-20190423113516302