@@ -21,6 +21,7 @@ import (
2121 "errors"
2222 "net/http"
2323 "os/exec"
24+ "strconv"
2425 "strings"
2526 "time"
2627
@@ -33,7 +34,7 @@ import (
3334type healthChecker struct {
3435 component string
3536 enableRepair bool
36- healthCheckFunc func () bool
37+ healthCheckFunc func () ( bool , error )
3738 // The repair is "best-effort" and ignores the error from the underlying actions.
3839 // The bash commands to kill the process will fail if the service is down and hence ignore.
3940 repairFunc func ()
@@ -102,41 +103,37 @@ func getRepairFunc(hco *options.HealthCheckerOptions) func() {
102103}
103104
104105// getHealthCheckFunc returns the health check function based on the component.
105- func getHealthCheckFunc (hco * options.HealthCheckerOptions ) func () bool {
106+ func getHealthCheckFunc (hco * options.HealthCheckerOptions ) func () ( bool , error ) {
106107 switch hco .Component {
107108 case types .KubeletComponent :
108- return func () bool {
109- httpClient := http.Client {Timeout : hco .HealthCheckTimeout }
110- response , err := httpClient .Get (types .KubeletHealthCheckEndpoint )
111- if err != nil || response .StatusCode != http .StatusOK {
112- return false
113- }
114- return true
115- }
109+ return getKubeletHealthCheckFunc (hco .HealthCheckTimeout )
116110 case types .DockerComponent :
117- return func () bool {
111+ return func () ( bool , error ) {
118112 if _ , err := execCommand (hco .HealthCheckTimeout , "docker" , "ps" ); err != nil {
119- return false
113+ return false , nil
120114 }
121- return true
115+ return true , nil
122116 }
123117 case types .CRIComponent :
124- return func () bool {
118+ return func () ( bool , error ) {
125119 if _ , err := execCommand (hco .HealthCheckTimeout , hco .CriCtlPath , "--runtime-endpoint=" + hco .CriSocketPath , "--image-endpoint=" + hco .CriSocketPath , "pods" ); err != nil {
126- return false
120+ return false , nil
127121 }
128- return true
122+ return true , nil
129123 }
130124 }
131125 return nil
132126}
133127
134128// CheckHealth checks for the health of the component and tries to repair if enabled.
135129// Returns true if healthy, false otherwise.
136- func (hc * healthChecker ) CheckHealth () bool {
137- healthy := hc .healthCheckFunc ()
130+ func (hc * healthChecker ) CheckHealth () (bool , error ) {
131+ healthy , err := hc .healthCheckFunc ()
132+ if err != nil {
133+ return healthy , err
134+ }
138135 if healthy {
139- return true
136+ return true , nil
140137 }
141138 // The service is unhealthy.
142139 // Attempt repair based on flag.
@@ -152,14 +149,13 @@ func (hc *healthChecker) CheckHealth() bool {
152149 hc .repairFunc ()
153150 }
154151 }
155- return false
152+ return false , nil
156153}
157154
158155// execCommand executes the bash command and returns the (output, error) from command, error if timeout occurs.
159156func execCommand (timeout time.Duration , command string , args ... string ) (string , error ) {
160157 ctx , cancel := context .WithTimeout (context .Background (), timeout )
161158 defer cancel ()
162-
163159 cmd := exec .CommandContext (ctx , command , args ... )
164160 out , err := cmd .Output ()
165161 if err != nil {
@@ -168,3 +164,66 @@ func execCommand(timeout time.Duration, command string, args ...string) (string,
168164 }
169165 return strings .TrimSuffix (string (out ), "\n " ), nil
170166}
167+
168+ // kubeletHttpHealthCheck checks the health api response on kubelet.
169+ // Returns true for healthy, false otherwise.
170+ func kubeletHttpHealthCheck (healthCheckTimeout time.Duration ) bool {
171+ httpClient := http.Client {Timeout : healthCheckTimeout }
172+ response , err := httpClient .Get (types .KubeletHealthCheckEndpoint )
173+ if err != nil || response .StatusCode != http .StatusOK {
174+ glog .Info ("kubelet failed http health check" )
175+ return false
176+ }
177+ return true
178+ }
179+
180+ // kubeletConnectionHealthCheck checks for the kubelet-apiserver connection issue
181+ // by checking repeated occurrences of log "use of closed network connection" in kubelet logs.
182+ // Returns true if the pattern does not exist 10 times since start of service or the last 10 min, false otherwise.
183+ func kubeletConnectionHealthCheck () (bool , error ) {
184+ kubeletUptimeFunc := getUptimeFunc (types .KubeletComponent )
185+ uptime , err := kubeletUptimeFunc ()
186+ if err != nil {
187+ return true , err
188+ }
189+ logStartTime := time .Now ().Add (- uptime ).Format (types .LogParsingTimeLayout )
190+ if err != nil {
191+ return true , err
192+ }
193+ out , err := execCommand (types .CmdTimeout , "/bin/sh" , "-c" ,
194+ // Query kubelet logs since the logStartTime
195+ `journalctl --unit kubelet --since "` + logStartTime +
196+ // Grep the pattern for lost connection
197+ `" | grep -i "` + types .KubeletClosedConnectionLogPattern +
198+ // Get the count of occurrences
199+ `" | wc -l` )
200+ if err != nil {
201+ return true , err
202+ }
203+ occurrences , err := strconv .Atoi (out )
204+ if err != nil {
205+ return true , err
206+ }
207+ if occurrences >= types .KubeletClosedConnectionLogPatternThresholdCount {
208+ glog .Infof ("kubelet failed apiserver connection check, log pattern occurrences: %v" , occurrences )
209+ return false , nil
210+ }
211+ return true , nil
212+ }
213+
214+ // getKubeletHealthCheckFunc returns a function that checks for kubelet health and
215+ // return false if identified as unhealthy, true otherwise.
216+ func getKubeletHealthCheckFunc (healthCheckTimeout time.Duration ) func () (bool , error ) {
217+ return func () (bool , error ) {
218+ httpHealthy := kubeletHttpHealthCheck (healthCheckTimeout )
219+ connectionHealthy , err := kubeletConnectionHealthCheck ()
220+ // The plugin will return Unknown status code in case there is any error in
221+ // checking kubelet health.
222+ if err != nil {
223+ glog .Infof ("Error in determining apiserver connection health: %v" , err )
224+ return false , err
225+ }
226+ healthy := httpHealthy && connectionHealthy
227+ return healthy , nil
228+ }
229+ }
0 commit comments