-
Notifications
You must be signed in to change notification settings - Fork 4.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add a healthcheck to detect when OVS is restarted
A periodic background process watches for OVS being reset to its default state and, when that happens, causes the entire SDN process to restart. This avoids the need to order the startup of the SDN process relative to OVS, and makes it easier to run the process in a pod. In the future it should be possible to handle this case without restarting the process.
- Loading branch information
1 parent
b0073eb
commit 189e581
Showing
3 changed files
with
190 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
package node | ||
|
||
import ( | ||
"fmt" | ||
"time" | ||
|
||
"github.com/golang/glog" | ||
|
||
utilruntime "k8s.io/apimachinery/pkg/util/runtime" | ||
utilwait "k8s.io/apimachinery/pkg/util/wait" | ||
|
||
"github.com/openshift/origin/pkg/util/ovs/ovsclient" | ||
) | ||
|
||
// Timing and default-endpoint settings for the OVS health check.
const (
	// ovsDialTimeout bounds every attempt to connect to the OVS server. It is
	// also used as the retry period of the disconnect-watching loop.
	ovsDialTimeout = 5 * time.Second
	// ovsHealthcheckInterval is the period of the periodic health check loop.
	ovsHealthcheckInterval = 30 * time.Second
	// ovsRecoveryTimeout is how long the health check keeps trying to reach
	// OVS again after a disconnect before treating the server as unhealthy.
	ovsRecoveryTimeout = 10 * time.Second
	// ovsDialDefaultNetwork and ovsDialDefaultAddress name the default OVS
	// database socket.
	ovsDialDefaultNetwork = "unix"
	ovsDialDefaultAddress = "/var/run/openvswitch/db.sock"
)
|
||
// waitForOVS polls until the OVS server responds to a connection and an 'echo' | ||
// command. | ||
func waitForOVS(network, addr string) error { | ||
return utilwait.PollImmediate(time.Second, time.Minute, func() (bool, error) { | ||
c, err := ovsclient.DialTimeout(network, addr, ovsDialTimeout) | ||
if err != nil { | ||
glog.V(2).Infof("waiting for OVS to start: %v", err) | ||
return false, nil | ||
} | ||
defer c.Close() | ||
if err := c.Ping(); err != nil { | ||
glog.V(2).Infof("waiting for OVS to start, ping failed: %v", err) | ||
return false, nil | ||
} | ||
return true, nil | ||
}) | ||
} | ||
|
||
// runOVSHealthCheck runs two background loops - one that waits for disconnection | ||
// from the OVS server and then checks healthFn, and one that periodically checks | ||
// healthFn. If healthFn returns false in either of these two cases while the OVS | ||
// server is responsive the node process will terminate. | ||
func runOVSHealthCheck(network, addr string, healthFn func() bool) { | ||
// this loop holds an open socket connection to OVS until it times out, then | ||
// checks for health | ||
go utilwait.Until(func() { | ||
c, err := ovsclient.DialTimeout(network, addr, ovsDialTimeout) | ||
if err != nil { | ||
utilruntime.HandleError(fmt.Errorf("SDN healthcheck unable to connect to OVS server: %v", err)) | ||
return | ||
} | ||
defer c.Close() | ||
|
||
err = c.WaitForDisconnect() | ||
utilruntime.HandleError(fmt.Errorf("SDN healthcheck disconnected from OVS server: %v", err)) | ||
|
||
err = utilwait.PollImmediate(100*time.Millisecond, ovsRecoveryTimeout, func() (bool, error) { | ||
c, err := ovsclient.DialTimeout(network, addr, ovsDialTimeout) | ||
if err != nil { | ||
glog.V(2).Infof("SDN healthcheck unable to reconnect to OVS server: %v", err) | ||
return false, nil | ||
} | ||
defer c.Close() | ||
if err := c.Ping(); err != nil { | ||
glog.V(2).Infof("SDN healthcheck unable to ping OVS server: %v", err) | ||
return false, nil | ||
} | ||
if !healthFn() { | ||
return false, fmt.Errorf("OVS health check failed") | ||
} | ||
return true, nil | ||
}) | ||
if err != nil { | ||
// If OVS restarts and our health check fails, we exit | ||
// TODO: make openshift-sdn able to reconcile without a restart | ||
glog.Fatalf("SDN healthcheck detected unhealthy OVS server, restarting: %v", err) | ||
} | ||
}, ovsDialTimeout, utilwait.NeverStop) | ||
|
||
// this loop periodically verifies we can still connect to the OVS server and | ||
// is an upper bound on the time we wait before detecting a failed OVS configuartion | ||
go utilwait.Until(func() { | ||
c, err := ovsclient.DialTimeout(network, addr, ovsDialTimeout) | ||
if err != nil { | ||
glog.V(2).Infof("SDN healthcheck unable to reconnect to OVS server: %v", err) | ||
return | ||
} | ||
defer c.Close() | ||
if err := c.Ping(); err != nil { | ||
glog.V(2).Infof("SDN healthcheck unable to ping OVS server: %v", err) | ||
return | ||
} | ||
if !healthFn() { | ||
glog.Fatalf("SDN healthcheck detected unhealthy OVS server, restarting: %v", err) | ||
} | ||
glog.V(4).Infof("SDN healthcheck succeeded") | ||
}, ovsHealthcheckInterval, utilwait.NeverStop) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
package ovsclient | ||
|
||
import ( | ||
"fmt" | ||
"io" | ||
"io/ioutil" | ||
"net" | ||
"net/rpc" | ||
"net/rpc/jsonrpc" | ||
"time" | ||
) | ||
|
||
// Client is an RPC client for communicating with OVS. The methods of the
// embedded *rpc.Client (such as Call and Close) are available on it directly.
type Client struct {
	// Client is the RPC client layered over conn.
	*rpc.Client

	// conn is the underlying connection, kept so that WaitForDisconnect can
	// read from it directly.
	conn net.Conn
}
|
||
// New creates a new Client from a connection. | ||
func New(conn net.Conn) *Client { | ||
return &Client{ | ||
Client: jsonrpc.NewClient(conn), | ||
conn: conn, | ||
} | ||
} | ||
|
||
// DialTimeout dials the provided network and address, and if it responds within | ||
// timeout will return a valid Client. | ||
func DialTimeout(network, addr string, timeout time.Duration) (*Client, error) { | ||
conn, err := net.DialTimeout(network, addr, timeout) | ||
if err != nil { | ||
return nil, err | ||
} | ||
return New(conn), nil | ||
} | ||
|
||
// Ping returns nil if the OVS server responded to an "echo" command. | ||
func (c *Client) Ping() error { | ||
var result interface{} | ||
if err := c.Call("echo", []string{"hello"}, &result); err != nil { | ||
return err | ||
} | ||
return nil | ||
} | ||
|
||
// WaitForDisconnect will block until the provided connection is closed | ||
// and return an error. This consumes the connection. | ||
func (c *Client) WaitForDisconnect() error { | ||
n, err := io.Copy(ioutil.Discard, c.conn) | ||
if err != nil && err != io.EOF { | ||
return err | ||
} | ||
if n > 0 { | ||
return fmt.Errorf("unexpected bytes read waiting for disconnect: %d", n) | ||
} | ||
return nil | ||
} |