kube-proxy 配置不当导致 service 异常
之前调研 nlb 后端获取真实 ip 的特性时,发现当 kube-proxy 出现如下报错时,生成的 iptables 规则会不符合预期,即丢弃发往当前 service node port 的流量。
1 -A KUBE-XLB-HCMTY43AHEJZZDHI -m comment --comment "2048-game/service-2048: has no local endpoints" -j KUBE-MARK-DROP
实际情况是当前节点上运行着 pod
1 2 3 4 5 $ kubectl --kubeconfig kubeconfig -n 2048-game get pod -o wide NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE 2048-deployment-7bddb7dc45-c2dk6 1/1 Running 0 109m ip-10-188-166-140.ec2.internal <none> 2048-deployment-7bddb7dc45-nc88f 1/1 Running 0 109m ip-10-188-166-140.ec2.internal <none>
而 kube-proxy 提供的 service 健康检查接口却显示本地 endpoint 数量为 0
1 2 3 4 5 6 7 8 # curl http: { "service" : { "namespace" : "2048-game" , "name" : "service-2048" } , "localEndpoints" : 0 }
错误信息收集:检查 kube-proxy 日志,发现 kube-proxy 在获取节点 ip 地址时有报错信息
1 2 3 W0525 07:13:03.916890 1 server.go:604] Failed to retrieve node info: nodes "ip-10-188-166-140" not found I0525 07:13:03.916921 1 server_others.go:148] Using iptables Proxier. W0525 07:13:03.917047 1 proxier.go:312] invalid nodeIP, initializing kube-proxy with 127.0.0.1 as nodeIP
在 kube-proxy 的逻辑中,这个 ip 地址需要和 endpoint 对象中 nodeName 所对应节点的地址做匹配
1 2 3 4 5 6 7 8 9 10 11 12 13 14 .... subsets: - addresses: - ip: 10.188 .166 .183 nodeName: ip-10-188-166-140.ec2.internal targetRef: kind: Pod name: 2048 -deployment-7bddb7dc45-c2dk6 namespace: 2048 -game resourceVersion: "1695104" uid: e52b247a-9e51-11ea-b101-02d6ecb85825 - ip: 10.188 .166 .232 nodeName: ip-10-188-166-140.ec2.internal ...
看下 kube-proxy 的启动关键参数
1 2 3 4 5 ... I0525 07:13:03.848460 1 flags.go:33] FLAG: --bind-address="" ... I0525 07:13:03.848697 1 flags.go:33] FLAG: --hostname-override="ip-10-188-166-140.ec2.internal" ...
代码逻辑分析 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 func NewProxier (ipt utiliptables.Interface, sysctl utilsysctl.Interface, exec utilexec.Interface, syncPeriod time.Duration, minSyncPeriod time.Duration, masqueradeAll bool , masqueradeBit int , clusterCIDR string , hostname string , nodeIP net.IP, recorder record.EventRecorder, healthzServer healthcheck.HealthzUpdater, nodePortAddresses []string , ) (*Proxier, error ) {... if nodeIP == nil { glog.Warningf("invalid nodeIP, initializing kube-proxy with as nodeIP" ) nodeIP = net.ParseIP("" ) }
1 2 3 4 5 6 7 8 9 10 11 12 13 nodeIP := net.ParseIP(config.BindAddress) if nodeIP.IsUnspecified() { nodeIP = getNodeIP(client, hostname) } if proxyMode == proxyModeIPTables { glog.V(0 ).Info("Using iptables Proxier." ) if config.IPTables.MasqueradeBit == nil { return nil , fmt.Errorf("unable to read IPTables MasqueradeBit from config" ) } proxierIPTables, err := iptables.NewProxier(
1 2 3 4 5 6 7 8 9 10 11 12 13 14 func getNodeIP (client clientset.Interface, hostname string ) net.IP { var nodeIP net.IP node, err := client.CoreV1().Nodes().Get(hostname, metav1.GetOptions{}) if err != nil { glog.Warningf("Failed to retrieve node info: %v" , err) return nil } nodeIP, err = utilnode.GetNodeHostIP(node) if err != nil { glog.Warningf("Failed to retrieve node IP: %v" , err) return nil } return nodeIP }
这个错误信息就很奇怪,明明前面启动时已经指定了 --hostname-override="ip-10-188-166-140.ec2.internal"
1 2 3 4 5 hostname, err := utilnode.GetHostname(config.HostnameOverride) if err != nil { return nil , err }
1 fs.StringVar(&o.config.HostnameOverride, "hostname-override" , o.config.HostnameOverride, "If non-empty, will use this string as identification instead of the actual hostname." )
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 func newProxyServer ( config *proxyconfigapi.KubeProxyConfiguration, cleanupAndExit bool , cleanupIPVS bool , scheme *runtime.Scheme, master string ) (*ProxyServer, error ) { if config == nil { return nil , errors.New("config is required" ) } ... 移除不相关代码 hostname, err := utilnode.GetHostname(config.HostnameOverride) if err != nil { return nil , err }
1 2 3 4 func NewProxyServer (o *Options) (*ProxyServer, error ) { return newProxyServer(o.config, o.CleanupAndExit, o.CleanupIPVS, o.scheme, o.master) }
调用 NewProxyServer()
1 2 3 4 5 6 7 8 func (o *Options) Run() error { if len (o.WriteConfigTo) > 0 { return o.writeConfigFile() } proxyServer, err := NewProxyServer(o) ... }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 func NewProxyCommand () *cobra.Command { opts := NewOptions() cmd := &cobra.Command{ ... Run: func (cmd *cobra.Command, args []string ) { verflag.PrintAndExitIfRequested() utilflag.PrintFlags(cmd.Flags()) if err := initForOS(opts.WindowsService); err != nil { glog.Fatalf("failed OS init: %v" , err) } if err := opts.Complete(); err != nil { glog.Fatalf("failed complete: %v" , err) } if err := opts.Validate(args); err != nil { glog.Fatalf("failed validate: %v" , err) } glog.Fatal(opts.Run()) }, } var err error opts.config, err = opts.ApplyDefaults(opts.config) if err != nil { glog.Fatalf("unable to create flag defaults: %v" , err) } opts.AddFlags(cmd.Flags()) cmd.MarkFlagFilename("config" , "yaml" , "yml" , "json" ) return cmd }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 func (o *Options) Complete() error { if len (o.ConfigFile) == 0 && len (o.WriteConfigTo) == 0 { glog.Warning("WARNING: all flags other than --config, --write-config-to, and --cleanup are deprecated. Please begin using a config file ASAP." ) o.applyDeprecatedHealthzPortToConfig() } if len (o.ConfigFile) > 0 { if c, err := o.loadConfigFromFile(o.ConfigFile); err != nil { return err } else { o.config = c } } err := utilfeature.DefaultFeatureGate.SetFromMap(o.config.FeatureGates) if err != nil { return err } return nil }
验证结论 1 2 3 4 5 6 7 8 9 10 /usr/local/bin/kube-proxy --hostname-override=ip-10-188-166-140.ec2.internal --v=8 ... I0525 10:19:12.711084 1543 flags.go:33] FLAG: --hostname-override="ip-10-188-166-140.ec2.internal" ... W0525 10:19:12.711412 1543 server.go:194] WARNING: all flags other than --config, --write-config-to, and --cleanup are deprecated. Please begin using a config file ASAP. I0525 10:19:12.711447 1543 feature_gate.go:206] feature gates: &{map[]} I0525 10:19:12.713822 1543 iptables.go:611] couldn't get iptables-restore version; assuming it doesn't support --wait I0525 10:19:12.721048 1543 server.go:412] Neither kubeconfig file nor master URL was specified. Falling back to in-cluster config. W0525 10:19:12.722352 1543 server_others.go:295] Flag proxy-mode="" unknown, assuming iptables proxy I0525 10:19:12.723544 1543 round_trippers.go:383] GET
的配置参数,发现在 kubelet
和 配置文件中同时指定,命令行参数设定值具有更高的优先级。