package serf
|
|
import (
|
"io"
|
"log"
|
"os"
|
"time"
|
|
"github.com/hashicorp/memberlist"
|
)
|
|
// ProtocolVersionMap is the mapping of Serf delegate protocol versions
// to memberlist protocol versions. We mask the memberlist protocols using
// our own protocol version.
//
// Every Serf protocol version listed here (2 through 5) maps to memberlist
// protocol version 2.
//
// The map is populated at declaration time rather than from an init
// function, so it is already non-nil while other package-level variables
// are being initialized and while any init function in the package runs.
var ProtocolVersionMap = map[uint8]uint8{
	5: 2,
	4: 2,
	3: 2,
	2: 2,
}
|
|
// Config is the configuration for creating a Serf instance.
|
type Config struct {
|
// The name of this node. This must be unique in the cluster. If this
|
// is not set, Serf will set it to the hostname of the running machine.
|
NodeName string
|
|
// The tags for this role, if any. This is used to provide arbitrary
|
// key/value metadata per-node. For example, a "role" tag may be used to
|
// differentiate "load-balancer" from a "web" role as parts of the same cluster.
|
// Tags are deprecating 'Role', and instead it acts as a special key in this
|
// map.
|
Tags map[string]string
|
|
// EventCh is a channel that receives all the Serf events. The events
|
// are sent on this channel in proper ordering. Care must be taken that
|
// this channel doesn't block, either by processing the events quick
|
// enough or buffering the channel, otherwise it can block state updates
|
// within Serf itself. If no EventCh is specified, no events will be fired,
|
// but point-in-time snapshots of members can still be retrieved by
|
// calling Members on Serf.
|
EventCh chan<- Event
|
|
// ProtocolVersion is the protocol version to speak. This must be between
|
// ProtocolVersionMin and ProtocolVersionMax.
|
ProtocolVersion uint8
|
|
// BroadcastTimeout is the amount of time to wait for a broadcast
|
// message to be sent to the cluster. Broadcast messages are used for
|
// things like leave messages and force remove messages. If this is not
|
// set, a timeout of 5 seconds will be set.
|
BroadcastTimeout time.Duration
|
|
// LeavePropagateDelay is for our leave (node dead) message to propagate
|
// through the cluster. In particular, we want to stay up long enough to
|
// service any probes from other nodes before they learn about us
|
// leaving and stop probing. Otherwise, we risk getting node failures as
|
// we leave.
|
LeavePropagateDelay time.Duration
|
|
// The settings below relate to Serf's event coalescence feature. Serf
|
// is able to coalesce multiple events into single events in order to
|
// reduce the amount of noise that is sent along the EventCh. For example
|
// if five nodes quickly join, the EventCh will be sent one EventMemberJoin
|
// containing the five nodes rather than five individual EventMemberJoin
|
// events. Coalescence can mitigate potential flapping behavior.
|
//
|
// Coalescence is disabled by default and can be enabled by setting
|
// CoalescePeriod.
|
//
|
// CoalescePeriod specifies the time duration to coalesce events.
|
// For example, if this is set to 5 seconds, then all events received
|
// within 5 seconds that can be coalesced will be.
|
//
|
// QuiescentPeriod specifies the duration of time where if no events
|
// are received, coalescence immediately happens. For example, if
|
// CoalscePeriod is set to 10 seconds but QuiscentPeriod is set to 2
|
// seconds, then the events will be coalesced and dispatched if no
|
// new events are received within 2 seconds of the last event. Otherwise,
|
// every event will always be delayed by at least 10 seconds.
|
CoalescePeriod time.Duration
|
QuiescentPeriod time.Duration
|
|
// The settings below relate to Serf's user event coalescing feature.
|
// The settings operate like above but only affect user messages and
|
// not the Member* messages that Serf generates.
|
UserCoalescePeriod time.Duration
|
UserQuiescentPeriod time.Duration
|
|
// The settings below relate to Serf keeping track of recently
|
// failed/left nodes and attempting reconnects.
|
//
|
// ReapInterval is the interval when the reaper runs. If this is not
|
// set (it is zero), it will be set to a reasonable default.
|
//
|
// ReconnectInterval is the interval when we attempt to reconnect
|
// to failed nodes. If this is not set (it is zero), it will be set
|
// to a reasonable default.
|
//
|
// ReconnectTimeout is the amount of time to attempt to reconnect to
|
// a failed node before giving up and considering it completely gone.
|
//
|
// TombstoneTimeout is the amount of time to keep around nodes
|
// that gracefully left as tombstones for syncing state with other
|
// Serf nodes.
|
ReapInterval time.Duration
|
ReconnectInterval time.Duration
|
ReconnectTimeout time.Duration
|
TombstoneTimeout time.Duration
|
|
// FlapTimeout is the amount of time less than which we consider a node
|
// being failed and rejoining looks like a flap for telemetry purposes.
|
// This should be set less than a typical reboot time, but large enough
|
// to see actual events, given our expected detection times for a failed
|
// node.
|
FlapTimeout time.Duration
|
|
// QueueCheckInterval is the interval at which we check the message
|
// queue to apply the warning and max depth.
|
QueueCheckInterval time.Duration
|
|
// QueueDepthWarning is used to generate warning message if the
|
// number of queued messages to broadcast exceeds this number. This
|
// is to provide the user feedback if events are being triggered
|
// faster than they can be disseminated
|
QueueDepthWarning int
|
|
// MaxQueueDepth is used to start dropping messages if the number
|
// of queued messages to broadcast exceeds this number. This is to
|
// prevent an unbounded growth of memory utilization
|
MaxQueueDepth int
|
|
// MinQueueDepth, if >0 will enforce a lower limit for dropping messages
|
// and then the max will be max(MinQueueDepth, 2*SizeOfCluster). This
|
// defaults to 0 which disables this dynamic sizing feature. If this is
|
// >0 then MaxQueueDepth will be ignored.
|
MinQueueDepth int
|
|
// RecentIntentTimeout is used to determine how long we store recent
|
// join and leave intents. This is used to guard against the case where
|
// Serf broadcasts an intent that arrives before the Memberlist event.
|
// It is important that this not be too short to avoid continuous
|
// rebroadcasting of dead events.
|
RecentIntentTimeout time.Duration
|
|
// EventBuffer is used to control how many events are buffered.
|
// This is used to prevent re-delivery of events to a client. The buffer
|
// must be large enough to handle all "recent" events, since Serf will
|
// not deliver messages that are older than the oldest entry in the buffer.
|
// Thus if a client is generating too many events, it's possible that the
|
// buffer gets overrun and messages are not delivered.
|
EventBuffer int
|
|
// QueryBuffer is used to control how many queries are buffered.
|
// This is used to prevent re-delivery of queries to a client. The buffer
|
// must be large enough to handle all "recent" events, since Serf will not
|
// deliver queries older than the oldest entry in the buffer.
|
// Thus if a client is generating too many queries, it's possible that the
|
// buffer gets overrun and messages are not delivered.
|
QueryBuffer int
|
|
// QueryTimeoutMult configures the default timeout multipler for a query to run if no
|
// specific value is provided. Queries are real-time by nature, where the
|
// reply is time sensitive. As a result, results are collected in an async
|
// fashion, however the query must have a bounded duration. We want the timeout
|
// to be long enough that all nodes have time to receive the message, run a handler,
|
// and generate a reply. Once the timeout is exceeded, any further replies are ignored.
|
// The default value is
|
//
|
// Timeout = GossipInterval * QueryTimeoutMult * log(N+1)
|
//
|
QueryTimeoutMult int
|
|
// QueryResponseSizeLimit and QuerySizeLimit limit the inbound and
|
// outbound payload sizes for queries, respectively. These must fit
|
// in a UDP packet with some additional overhead, so tuning these
|
// past the default values of 1024 will depend on your network
|
// configuration.
|
QueryResponseSizeLimit int
|
QuerySizeLimit int
|
|
// MemberlistConfig is the memberlist configuration that Serf will
|
// use to do the underlying membership management and gossip. Some
|
// fields in the MemberlistConfig will be overwritten by Serf no
|
// matter what:
|
//
|
// * Name - This will always be set to the same as the NodeName
|
// in this configuration.
|
//
|
// * Events - Serf uses a custom event delegate.
|
//
|
// * Delegate - Serf uses a custom delegate.
|
//
|
MemberlistConfig *memberlist.Config
|
|
// LogOutput is the location to write logs to. If this is not set,
|
// logs will go to stderr.
|
LogOutput io.Writer
|
|
// Logger is a custom logger which you provide. If Logger is set, it will use
|
// this for the internal logger. If Logger is not set, it will fall back to the
|
// behavior for using LogOutput. You cannot specify both LogOutput and Logger
|
// at the same time.
|
Logger *log.Logger
|
|
// SnapshotPath if provided is used to snapshot live nodes as well
|
// as lamport clock values. When Serf is started with a snapshot,
|
// it will attempt to join all the previously known nodes until one
|
// succeeds and will also avoid replaying old user events.
|
SnapshotPath string
|
|
// RejoinAfterLeave controls our interaction with the snapshot file.
|
// When set to false (default), a leave causes a Serf to not rejoin
|
// the cluster until an explicit join is received. If this is set to
|
// true, we ignore the leave, and rejoin the cluster on start.
|
RejoinAfterLeave bool
|
|
// EnableNameConflictResolution controls if Serf will actively attempt
|
// to resolve a name conflict. Since each Serf member must have a unique
|
// name, a cluster can run into issues if multiple nodes claim the same
|
// name. Without automatic resolution, Serf merely logs some warnings, but
|
// otherwise does not take any action. Automatic resolution detects the
|
// conflict and issues a special query which asks the cluster for the
|
// Name -> IP:Port mapping. If there is a simple majority of votes, that
|
// node stays while the other node will leave the cluster and exit.
|
EnableNameConflictResolution bool
|
|
// DisableCoordinates controls if Serf will maintain an estimate of this
|
// node's network coordinate internally. A network coordinate is useful
|
// for estimating the network distance (i.e. round trip time) between
|
// two nodes. Enabling this option adds some overhead to ping messages.
|
DisableCoordinates bool
|
|
// KeyringFile provides the location of a writable file where Serf can
|
// persist changes to the encryption keyring.
|
KeyringFile string
|
|
// Merge can be optionally provided to intercept a cluster merge
|
// and conditionally abort the merge.
|
Merge MergeDelegate
|
|
// UserEventSizeLimit is maximum byte size limit of user event `name` + `payload` in bytes.
|
// It's optimal to be relatively small, since it's going to be gossiped through the cluster.
|
UserEventSizeLimit int
|
}
|
|
// Init allocates the subdata structures
|
func (c *Config) Init() {
|
if c.Tags == nil {
|
c.Tags = make(map[string]string)
|
}
|
}
|
|
// DefaultConfig returns a Config struct that contains reasonable defaults
|
// for most of the configurations.
|
func DefaultConfig() *Config {
|
hostname, err := os.Hostname()
|
if err != nil {
|
panic(err)
|
}
|
|
return &Config{
|
NodeName: hostname,
|
BroadcastTimeout: 5 * time.Second,
|
LeavePropagateDelay: 1 * time.Second,
|
EventBuffer: 512,
|
QueryBuffer: 512,
|
LogOutput: os.Stderr,
|
ProtocolVersion: 4,
|
ReapInterval: 15 * time.Second,
|
RecentIntentTimeout: 5 * time.Minute,
|
ReconnectInterval: 30 * time.Second,
|
ReconnectTimeout: 24 * time.Hour,
|
QueueCheckInterval: 30 * time.Second,
|
QueueDepthWarning: 128,
|
MaxQueueDepth: 4096,
|
TombstoneTimeout: 24 * time.Hour,
|
FlapTimeout: 60 * time.Second,
|
MemberlistConfig: memberlist.DefaultLANConfig(),
|
QueryTimeoutMult: 16,
|
QueryResponseSizeLimit: 1024,
|
QuerySizeLimit: 1024,
|
EnableNameConflictResolution: true,
|
DisableCoordinates: false,
|
UserEventSizeLimit: 512,
|
}
|
}
|