19 changes: 14 additions & 5 deletions README.md
@@ -14,7 +14,8 @@ enabled by default in the GCE cluster.
# Background
There are tons of node problems that could possibly affect the pods running on the
node, such as:
* Hardware issues: Bad cpu, memory or disk;
* Infrastructure daemon issues: ntp service down;
* Hardware issues: Bad cpu, memory or disk, ntp service down;
* Kernel issues: Kernel deadlock, corrupted file system;
* Container runtime issues: Unresponsive runtime daemon;
* ...
@@ -53,23 +54,30 @@ List of supported problem daemons:
|----------------|:---------------:|:------------|
| [KernelMonitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor.json) | KernelDeadlock | A system log monitor that monitors the kernel log and reports problems according to predefined rules. |
| [AbrtAdaptor](https://github.com/kubernetes/node-problem-detector/blob/master/config/abrt-adaptor.json) | None | Monitors ABRT log messages and reports them further. ABRT (Automatic Bug Report Tool) is a health monitoring daemon able to catch kernel problems as well as application crashes of various kinds occurring on the host. For more information visit the [link](https://github.com/abrt). |
| [CustomPluginMonitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/custom-plugin-monitor.json) | On-demand (according to user configuration) | A custom plugin monitor for node-problem-detector to invoke and check various node problems with user-defined check scripts. See the proposal [here](https://docs.google.com/document/d/1jK_5YloSYtboj-DtfjmYKxfNnUxCAvohLnsH5aGCAYQ/edit#). |

# Usage
## Flags
* `--version`: Print current version of node-problem-detector.
* `--address`: The address to bind the node problem detector server.
* `--port`: The port to bind the node problem detector server. Use 0 to disable.
* `--system-log-monitors`: List of paths to system log monitor configuration files, comma separated, e.g.
[config/kernel-monitor.json](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor.json).
Node problem detector will start a separate log monitor for each configuration. You can
use different log monitors to monitor different system logs.
* `--custom-plugin-monitors`: List of paths to custom plugin monitor config files, comma separated, e.g.
[config/custom-plugin-monitor.json](https://github.com/kubernetes/node-problem-detector/blob/master/config/custom-plugin-monitor.json).
Node problem detector will start a separate custom plugin monitor for each configuration. You can
use different custom plugin monitors to monitor different node problems.
* `--apiserver-override`: A URI parameter used to customize how node-problem-detector
connects to the apiserver. The format is the same as the
[`source`](https://github.com/kubernetes/heapster/blob/master/docs/source-configuration.md#kubernetes)
flag of [Heapster](https://github.com/kubernetes/heapster).
For example, to run without auth, use the following config:
```
http://APISERVER_IP:APISERVER_PORT?inClusterConfig=false
```
Refer to the [heapster docs](https://github.com/kubernetes/heapster/blob/master/docs/source-configuration.md#kubernetes) for a complete list of available options.

Member: Why indent?

Member Author: This is mainly used to make the code block indent with the previous content in the same section.

* `--hostname-override`: A customized node name used by node-problem-detector to update conditions and emit events. node-problem-detector gets the node name first from `hostname-override`, then from the `NODE_NAME` environment variable, and finally falls back to `os.Hostname`.
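
Putting the flags above together, a minimal invocation could look like the sketch below. This is only an illustration: the binary location, config paths, and apiserver address are assumptions, not something prescribed by this PR.

```
# Sketch: run node-problem-detector with one system log monitor and one
# custom plugin monitor, talking to an apiserver without auth.
./node-problem-detector \
  --system-log-monitors=config/kernel-monitor.json \
  --custom-plugin-monitors=config/custom-plugin-monitor.json \
  --apiserver-override="http://APISERVER_IP:APISERVER_PORT?inClusterConfig=false"
```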

## Build Image
@@ -138,4 +146,5 @@ For more scenarios, see [here](https://github.com/kubernetes/heapster/blob/maste
# Links
* [Design Doc](https://docs.google.com/document/d/1cs1kqLziG-Ww145yN6vvlKguPbQQ0psrSBnEqpy0pzE/edit?usp=sharing)
* [Slides](https://docs.google.com/presentation/d/1bkJibjwWXy8YnB5fna6p-Ltiy-N5p01zUsA22wCNkXA/edit?usp=sharing)
* [Plugin Interface Proposal](https://docs.google.com/document/d/1jK_5YloSYtboj-DtfjmYKxfNnUxCAvohLnsH5aGCAYQ/edit#)
* [Addon Manifest](https://github.com/kubernetes/kubernetes/tree/master/cluster/addons/node-problem-detector)
17 changes: 14 additions & 3 deletions cmd/node_problem_detector.go
@@ -27,9 +27,11 @@ import (
"github.com/spf13/pflag"

"k8s.io/node-problem-detector/cmd/options"
"k8s.io/node-problem-detector/pkg/custompluginmonitor"
"k8s.io/node-problem-detector/pkg/problemclient"
"k8s.io/node-problem-detector/pkg/problemdetector"
"k8s.io/node-problem-detector/pkg/systemlogmonitor"
"k8s.io/node-problem-detector/pkg/types"
"k8s.io/node-problem-detector/pkg/version"
)

@@ -67,15 +69,24 @@ func main() {
os.Exit(0)
}

monitors := make(map[string]systemlogmonitor.LogMonitor)
monitors := make(map[string]types.Monitor)
for _, config := range npdo.SystemLogMonitorConfigPaths {
if _, ok := monitors[config]; ok {
// Skip the config if it's duplictaed.
glog.Warningf("Duplicated log monitor configuration %q", config)
// Skip the config if it's duplicated.
glog.Warningf("Duplicated monitor configuration %q", config)
continue
}
monitors[config] = systemlogmonitor.NewLogMonitorOrDie(config)
}

for _, config := range npdo.CustomPluginMonitorConfigPaths {
if _, ok := monitors[config]; ok {
// Skip the config if it's duplicated.
glog.Warningf("Duplicated monitor configuration %q", config)
continue
}
monitors[config] = custompluginmonitor.NewCustomPluginMonitorOrDie(config)
}
c := problemclient.NewClientOrDie(npdo)
p := problemdetector.NewProblemDetector(monitors, c)

5 changes: 5 additions & 0 deletions cmd/options/options.go
@@ -33,6 +33,9 @@ type NodeProblemDetectorOptions struct {
// SystemLogMonitorConfigPaths specifies the list of paths to system log monitor configuration
// files.
SystemLogMonitorConfigPaths []string
// CustomPluginMonitorConfigPaths specifies the list of paths to custom plugin monitor configuration
// files.
CustomPluginMonitorConfigPaths []string
// ApiServerOverride is the custom URI used to connect to Kubernetes ApiServer.
ApiServerOverride string
// PrintVersion is the flag determining whether version information is printed.
@@ -58,6 +61,8 @@ func NewNodeProblemDetectorOptions() *NodeProblemDetectorOptions {
func (npdo *NodeProblemDetectorOptions) AddFlags(fs *pflag.FlagSet) {
fs.StringSliceVar(&npdo.SystemLogMonitorConfigPaths, "system-log-monitors",
[]string{}, "List of paths to system log monitor config files, comma separated.")
fs.StringSliceVar(&npdo.CustomPluginMonitorConfigPaths, "custom-plugin-monitors",
[]string{}, "List of paths to custom plugin monitor config files, comma separated.")
fs.StringVar(&npdo.ApiServerOverride, "apiserver-override",
"", "Custom URI used to connect to Kubernetes ApiServer")
fs.BoolVar(&npdo.PrintVersion, "version", false, "Print version information and quit")
32 changes: 32 additions & 0 deletions config/custom-plugin-monitor.json
@@ -0,0 +1,32 @@
{
"plugin": "custom",
"pluginConfig": {
"invoke_interval": "30s",
"timeout": "5s",
Member: I feel like we could have global default invoke_interval and timeout, but we also need a per-script interval.

Member Author: For now, only timeout can be specified both globally and per plugin; invoke_interval cannot. When users want different invoke_intervals, splitting the plugins into separate custom plugin configs, each with its own invoke_interval, and specifying them all on the command line achieves the same goal.

Member: I'm fine with this for now, but please add a TODO in code. We should have a per-rule interval.

Member Author: Will do.

"max_output_length": 80,
"concurrency": 3
},
"source": "ntp-custom-plugin-monitor",
"conditions": [
{
"type": "NTPProblem",
"reason": "NTPIsUp",
"message": "ntp service is up"
}
],
"rules": [
{
"type": "temporary",
"reason": "NTPIsDown",
"path": "./config/plugin/check_ntp.sh",
"timeout": "3s"
Member Author: This is the per plugin timeout config. :)

},
{
Member: Do we run the script twice? We should emit an event by default for a permanent node problem. However, I'm fine with doing that in the future.

Member Author: Yes. The reason for doing so is that this way we can give users more control over how events and conditions are emitted. When users want both an event and a condition for a reason, they should declare this explicitly.

Member: I think we should generate an event for a condition change. With that, you shouldn't need 2 rules here.

Member: Sometimes, for the same condition, the status is not changed but the reason is changed. Without an event, people will not even notice that.

Member Author: As with the log monitor's condition handling, when the reason changes, the timestamp and reason will be updated in the node status. I am fine with emitting events when a condition changes, but since events put stress on etcd, maybe we should emit as few events as we can. :)

Member: Condition change is quite rare. :)

"type": "permanent",
"condition": "NTPProblem",
"reason": "NTPIsDown",
"path": "./config/plugin/check_ntp.sh",
"timeout": "3s"
}
]
}
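
For a rough sense of how the two rules above surface on a node: the temporary rule produces an NTPIsDown event, while the permanent rule drives the NTPProblem condition. The sketch below shows one way to inspect the results; the node name is a placeholder and the exact kubectl queries are illustrative assumptions, not part of this PR.

```
# Sketch: check the NTPProblem condition and any NTPIsDown events.
kubectl describe node NODE_NAME | grep -A 3 NTPProblem
kubectl get events --field-selector reason=NTPIsDown
```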
23 changes: 23 additions & 0 deletions config/plugin/check_ntp.sh
@@ -0,0 +1,23 @@
#!/bin/bash

# NOTE: THIS NTP SERVICE CHECK SCRIPT ASSUMES THAT NTP SERVICE IS RUNNING UNDER SYSTEMD.
# THIS IS JUST AN EXAMPLE. YOU CAN WRITE YOUR OWN NODE PROBLEM PLUGIN ON DEMAND.

Member: Check whether this node is using systemd? If not, return Unknown?

Member Author: Good catch. Will do.

Member Author: Actually, what we use in our production env has those checks. :)

OK=0
NONOK=1
UNKNOWN=2

which systemctl >/dev/null
if [ $? -ne 0 ]; then
echo "Systemd is not supported"
exit $UNKNOWN
fi

systemctl status ntp.service | grep 'Active:' | grep -q running
if [ $? -ne 0 ]; then
echo "NTP service is not running"
exit $NONOK
fi

echo "NTP service is running"
exit $OK
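
As a quick sanity check of the plugin protocol, the script above can be run by hand on a systemd host; assuming ntp.service is active, the expected output and exit code would be:

```
$ ./config/plugin/check_ntp.sh
NTP service is running
$ echo $?
0
```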
7 changes: 7 additions & 0 deletions pkg/custompluginmonitor/README.md
@@ -0,0 +1,7 @@
# Custom Plugin Monitor
Member: We should complete the document. I'm fine with doing that in a following PR.

Member Author: Agreed on detailing this in another PR.

Member Author: Filed #150 to track unresolved comments. :)


Custom plugin monitor is a plugin mechanism for node-problem-detector. It extends
node-problem-detector to execute monitor scripts written in any language.
The monitor scripts must conform to the plugin protocol in exit code and standard
output. For more info about the plugin protocol, please refer to the
[node-problem-detector plugin interface proposal](https://docs.google.com/document/d/1jK_5YloSYtboj-DtfjmYKxfNnUxCAvohLnsH5aGCAYQ/edit#).
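
To make the protocol concrete, below is a minimal plugin sketch in the same spirit as the check_ntp.sh example in this PR: exit 0 for OK, 1 for a detected problem, 2 for unknown, with a one-line message on standard output. The file name and the disk-usage check itself are made up purely for illustration.

```
#!/bin/bash
# example_check_disk.sh: hypothetical custom plugin that reports a problem
# when the root filesystem is more than 90% full.
OK=0
NONOK=1
UNKNOWN=2

usage=$(df --output=pcent / 2>/dev/null | tail -1 | tr -dc '0-9')
if [ -z "$usage" ]; then
  echo "could not determine root filesystem usage"
  exit $UNKNOWN
fi

if [ "$usage" -gt 90 ]; then
  echo "root filesystem is ${usage}% full"
  exit $NONOK
fi

echo "root filesystem usage is ${usage}%"
exit $OK
```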
167 changes: 167 additions & 0 deletions pkg/custompluginmonitor/custom_plugin_monitor.go
@@ -0,0 +1,167 @@
/*
Copyright 2017 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package custompluginmonitor

import (
Member: Follow the convention:

	import(
		"encoding/json"
		"io/ioutil"
		"time"

		"github.com/golang/glog"

		cpmtypes "k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
		"k8s.io/node-problem-detector/pkg/types"
		"k8s.io/node-problem-detector/pkg/custompluginmonitor/plugin"
		"k8s.io/node-problem-detector/pkg/util/tomb"
	)

Member Author: Will do. What convention should kubernetes use? gofmt or goimports?

Member (@Random-Liu, Nov 16, 2017): I manually do that. :p

"encoding/json"
"io/ioutil"
"time"

"github.com/golang/glog"

"k8s.io/node-problem-detector/pkg/custompluginmonitor/plugin"
cpmtypes "k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
"k8s.io/node-problem-detector/pkg/types"
"k8s.io/node-problem-detector/pkg/util/tomb"
)

type customPluginMonitor struct {
config cpmtypes.CustomPluginConfig
conditions []types.Condition
plugin *plugin.Plugin
resultChan <-chan cpmtypes.Result
statusChan chan *types.Status
tomb *tomb.Tomb
}

// NewCustomPluginMonitorOrDie creates a new customPluginMonitor, and panics if an error occurs.
func NewCustomPluginMonitorOrDie(configPath string) types.Monitor {
c := &customPluginMonitor{
tomb: tomb.NewTomb(),
}
f, err := ioutil.ReadFile(configPath)
if err != nil {
glog.Fatalf("Failed to read configuration file %q: %v", configPath, err)
}
err = json.Unmarshal(f, &c.config)
if err != nil {
glog.Fatalf("Failed to unmarshal configuration file %q: %v", configPath, err)
}
// Apply configurations
err = (&c.config).ApplyConfiguration()
if err != nil {
glog.Fatalf("Failed to apply configuration for %q: %v", configPath, err)
}

// Validate configurations
err = c.config.Validate()
if err != nil {
glog.Fatalf("Failed to validate custom plugin config %+v: %v", c.config, err)
}

glog.Infof("Finish parsing custom plugin monitor config file: %+v", c.config)

c.plugin = plugin.NewPlugin(c.config)
// A 1000 size channel should be big enough.
c.statusChan = make(chan *types.Status, 1000)
return c
}

func (c *customPluginMonitor) Start() (<-chan *types.Status, error) {
glog.Info("Start custom plugin monitor")
go c.plugin.Run()
go c.monitorLoop()
return c.statusChan, nil
}

func (c *customPluginMonitor) Stop() {
glog.Info("Stop custom plugin monitor")
c.tomb.Stop()
}

// monitorLoop is the main loop of the custom plugin monitor.
func (c *customPluginMonitor) monitorLoop() {
c.initializeStatus()

resultChan := c.plugin.GetResultChan()

for {
select {
case result := <-resultChan:
glog.V(3).Infof("Receive new plugin result: %+v", result)
status := c.generateStatus(result)
glog.Infof("New status generated: %+v", status)
c.statusChan <- status
case <-c.tomb.Stopping():
c.plugin.Stop()
glog.Infof("Custom plugin monitor stopped")
c.tomb.Done()
return
}
}
}

// generateStatus generates status from the plugin check result.
func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Status {
timestamp := time.Now()
var events []types.Event
if result.Rule.Type == types.Temp {
// For a temporary error, only generate an event when the exit status is NonOK or above
if result.ExitStatus >= cpmtypes.NonOK {
events = append(events, types.Event{
Severity: types.Warn,
Timestamp: timestamp,
Reason: result.Rule.Reason,
Message: result.Message,
})
}
} else {
// For a permanent error, update the corresponding condition
for i := range c.conditions {
condition := &c.conditions[i]
if condition.Type == result.Rule.Condition {
Member: Combine the logic:

	status = (result.ExitStatus >= cpmtypes.NonOK)
	if condition.Status != status || condition.Reason != result.Rule.Reason {
	  condition.Transition = timestamp
	  condition.Message = result.Message
	}
	condition.Status = status
	condition.Reason = result.Rule.Reason

Member: And please generate event. :P

Member Author: Once we have a conclusion on emitting events on condition change, I prefer adding this as a TODO to be addressed in the next PR. :)

status := result.ExitStatus >= cpmtypes.NonOK
if condition.Status != status || condition.Reason != result.Rule.Reason {
condition.Transition = timestamp
condition.Message = result.Message
}
condition.Status = status
condition.Reason = result.Rule.Reason
break
}
}
}
return &types.Status{
Source: c.config.Source,
// TODO(random-liu): Aggregate events and conditions and then report periodically.
Events: events,
Conditions: c.conditions,
}
}

// initializeStatus initializes the internal condition and also reports it to the node problem detector.
func (c *customPluginMonitor) initializeStatus() {
// Initialize the default node conditions
c.conditions = initialConditions(c.config.DefaultConditions)
glog.Infof("Initialize condition generated: %+v", c.conditions)
// Update the initial status
c.statusChan <- &types.Status{
Source: c.config.Source,
Conditions: c.conditions,
}
}

func initialConditions(defaults []types.Condition) []types.Condition {
conditions := make([]types.Condition, len(defaults))
copy(conditions, defaults)
for i := range conditions {
// TODO(random-liu): Validate default conditions
conditions[i].Status = false
conditions[i].Transition = time.Now()
}
return conditions
}