-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Description
I implemented a Raft cluster in Go using the module "github.com/hashicorp/raft" and ran into a problem in the following scenario:
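There are currently two Raft clusters; their names, nodes, and IP addresses are listed below. (Each Raft cluster is initialized through the BootstrapCluster method. Note that the commands further down use loopback addresses of the form 127.0.0.x:800 rather than the 192.168.100.x:7000 addresses shown in this list.)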
Cluster1 BootstrapCluster servers:
- node1: {raft.ServerID: c1-node1, raft.ServerAddress: 192.168.100.1:7000}
- node2: {raft.ServerID: c1-node2, raft.ServerAddress: 192.168.100.2:7000}
- node3: {raft.ServerID: c1-node3, raft.ServerAddress: 192.168.100.3:7000}
Cluster2 BootstrapCluster servers:
- node3: {raft.ServerID: c2-node3, raft.ServerAddress: 192.168.100.3:7000}
- node4: {raft.ServerID: c2-node4, raft.ServerAddress: 192.168.100.4:7000}
- node5: {raft.ServerID: c2-node5, raft.ServerAddress: 192.168.100.5:7000}
Of these, "node1" and "node2" are started with the "Cluster1" configuration:
sudo ./raft_svr -cluster 'c1-node1,127.0.0.1,800;c1-node2,127.0.0.2,800;c1-node3,127.0.0.3,800' -id c1-node1
sudo ./raft_svr -cluster 'c1-node1,127.0.0.1,800;c1-node2,127.0.0.2,800;c1-node3,127.0.0.3,800' -id c1-node2
"node3","node4","node5" first start according to "Cluster2":
sudo ./raft_svr -cluster 'c2-node3,127.0.0.3,800;c2-node4,127.0.0.4,800;c2-node5,127.0.0.5,800' -id c2-node3
sudo ./raft_svr -cluster 'c2-node3,127.0.0.3,800;c2-node4,127.0.0.4,800;c2-node5,127.0.0.5,800' -id c2-node4
sudo ./raft_svr -cluster 'c2-node3,127.0.0.3,800;c2-node4,127.0.0.4,800;c2-node5,127.0.0.5,800' -id c2-node5
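After that, "node3" switches back and forth between "Cluster1" and "Cluster2", sometimes belonging to "Cluster1" and sometimes to "Cluster2": in the two log lines below its server list always shows the Cluster2 members, but the reported leader address alternates between a Cluster2 node (127.0.0.5:800) and a Cluster1 node (127.0.0.1:800).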
INFO[0170] current state:Follower, servers:[{Suffrage:Voter ID:c2-node3 Address:127.0.0.3:800} {Suffrage:Voter ID:c2-node4 Address:127.0.0.4:800} {Suffrage:Voter ID:c2-node5 Address:127.0.0.5:800}], leader address:127.0.0.5:800, last contact:2025-05-14 15:35:53.330867 +0800 CST m=+169.779019126
INFO[0171] current state:Follower, servers:[{Suffrage:Voter ID:c2-node3 Address:127.0.0.3:800} {Suffrage:Voter ID:c2-node4 Address:127.0.0.4:800} {Suffrage:Voter ID:c2-node5 Address:127.0.0.5:800}], leader address:127.0.0.1:800, last contact:2025-05-14 15:35:54.308388 +0800 CST m=+170.756576126
Is this situation expected?
here is my code:
package main
import (
"flag"
"fmt"
"io"
"net"
"os"
"strconv"
"strings"
"time"
"github.com/hashicorp/raft" // github.com/hashicorp/raft v1.7.3
log "github.com/sirupsen/logrus"
)
// raftCluster groups the local node identity, the configured cluster members
// and the Raft instance that Start creates for them.
type raftCluster struct {
// localRaftID is the Raft server ID of this process; Start uses it as the
// Raft LocalID and as the key for looking up the local address in servers.
localRaftID raft.ServerID
// servers maps every cluster member's Raft server ID to its transport address.
servers map[raft.ServerID]raft.ServerAddress // raftID : raftAddressPort
// raft is the Raft instance; it is assigned by Start.
raft *raft.Raft
// electionTimeout is computed by Start as ElectionTimeout * 2 * len(servers).
// NOTE(review): nothing in the visible code reads it afterwards.
electionTimeout time.Duration
}
// Start creates the local Raft node and bootstraps the cluster from r.servers.
//
// It configures the Raft timeouts, opens a TCP transport on the address that
// r.servers registers for r.localRaftID, creates in-memory log, stable and
// snapshot stores, and calls BootstrapCluster with one entry per member of
// r.servers. On success the running instance is stored in r.raft.
//
// An error is returned when r.localRaftID has no entry in r.servers, when the
// local address cannot be resolved or bound, or when creating or
// bootstrapping Raft fails. Underlying errors are wrapped so callers can
// inspect them with errors.Is / errors.As.
func (r *raftCluster) Start() error {
	// Without this check a missing ID yields an empty address, which only
	// fails later inside the transport with an unrelated-looking error.
	localAddr, ok := r.servers[r.localRaftID]
	if !ok {
		return fmt.Errorf("local raft id %q is not in the cluster server list", r.localRaftID)
	}
	localAddressPort := string(localAddr)

	config := raft.DefaultConfig()
	config.HeartbeatTimeout = 2000 * time.Millisecond
	config.ElectionTimeout = 5000 * time.Millisecond
	config.CommitTimeout = 2000 * time.Millisecond
	config.LeaderLeaseTimeout = 2000 * time.Millisecond
	config.LocalID = r.localRaftID
	config.LogOutput = log.StandardLogger().Out
	r.electionTimeout = config.ElectionTimeout * time.Duration(len(r.servers)*2)

	tcpAddr, err := net.ResolveTCPAddr("tcp", localAddressPort)
	if err != nil {
		return fmt.Errorf("resolve tcp address %s, %w", localAddressPort, err)
	}
	transport, err := raft.NewTCPTransport(localAddressPort, tcpAddr, 2, 10*time.Second, log.StandardLogger().Out)
	if err != nil {
		return fmt.Errorf("fail to create tcp transport, localAddressPort:%s, tcpAddr:%v, %w",
			localAddressPort, tcpAddr, err)
	}

	snapshots := raft.NewInmemSnapshotStore()
	logStore := raft.NewInmemStore()
	stableStore := raft.NewInmemStore()
	fm := NewFsm()
	r.raft, err = raft.NewRaft(config, fm, logStore, stableStore, snapshots, transport)
	if err != nil {
		// Best-effort cleanup so the listener does not stay bound; the
		// NewRaft error is the one worth reporting.
		_ = transport.Close()
		return fmt.Errorf("create raft error, %w", err)
	}

	configuration := raft.Configuration{
		Servers: make([]raft.Server, 0, len(r.servers)),
	}
	for sID, addr := range r.servers {
		configuration.Servers = append(configuration.Servers, raft.Server{
			ID:      sID,
			Address: addr,
		})
	}
	if err := r.raft.BootstrapCluster(configuration).Error(); err != nil {
		// Best-effort: stop the background Raft goroutines that NewRaft
		// started instead of leaving a half-initialised node running.
		_ = r.raft.Shutdown().Error()
		return fmt.Errorf("raft bootstrap failed, conf:%v, %w", configuration, err)
	}
	log.Infof("bootstrap cluster as config: %v", configuration)
	return nil
}
// checkLeaderState logs leadership information in an endless loop; it never
// returns.
//
// Every value received from the Raft leadership channel is logged together
// with the current state and leader address. In addition, once per second it
// calls VerifyLeader and logs the current server configuration: as leader
// when verification succeeds, as a non-leader (state, leader address, last
// contact) on raft.ErrNotLeader, and as a warning for any other error so that
// unexpected failures are not silently dropped.
func (r *raftCluster) checkLeaderState() {
	ticker := time.NewTicker(time.Second)
	leaderCh := r.raft.LeaderCh()
	for {
		select {
		case leader := <-leaderCh:
			log.Infof("im leader:%v, state:%s, leader address:%s", leader, r.raft.State(), r.raft.Leader())
		case <-ticker.C:
			verifyErr := r.raft.VerifyLeader().Error()
			servers := r.raft.GetConfiguration().Configuration().Servers
			switch verifyErr {
			case nil:
				log.Infof("im leader, servers:%v", servers)
			case raft.ErrNotLeader:
				// Not the leader: report what this node currently believes
				// about the cluster.
				log.Infof("current state:%v, servers:%+v, leader address:%v, last contact:%v",
					r.raft.State(), servers, r.raft.Leader(), r.raft.LastContact())
			default:
				log.Warnf("verify leader failed: %v, current state:%v, servers:%+v",
					verifyErr, r.raft.State(), servers)
			}
		}
	}
}
// main parses the -cluster and -id flags, builds the server map, starts the
// local Raft node and then logs leadership state until the process is killed.
//
// -cluster is a ';'-separated list of "ID,IP,Port" entries and -id selects
// which entry is the local node. Any malformed entry (wrong field count,
// unparsable IP or unparsable port) terminates the process with exit code 1.
func main() {
	var (
		clusters = flag.String("cluster", "",
			"cluster node address, fmt: ID,IP,Port;ID,IP,Port")
		clusterID = flag.String("id", "", "cluster id")
	)
	flag.Parse()
	if *clusterID == "" {
		log.Infof("cluster id missing")
		os.Exit(1)
	}

	servers := make(map[raft.ServerID]raft.ServerAddress)
	for _, cluster := range strings.Split(*clusters, ";") {
		info := strings.Split(cluster, ",")
		if len(info) != 3 {
			log.Infof("cluster args value is bad format")
			os.Exit(1)
		}

		nid := info[0]
		nip := net.ParseIP(info[1])
		if nip == nil {
			log.Infof("cluster %s ip %s parse failed", cluster, info[1])
			os.Exit(1)
		}
		nport, err := strconv.Atoi(info[2])
		if err != nil {
			log.Infof("cluster %s port %s parse failed, %v", cluster, info[2], err)
			// Exit like every other parse failure; continuing would
			// register the node with port 0.
			os.Exit(1)
		}

		log.Infof("cluster node id:%s, ip:%v, port:%d", nid, nip, nport)
		addr := net.TCPAddr{IP: nip, Port: nport}
		servers[raft.ServerID(nid)] = raft.ServerAddress(addr.String())
	}

	r := raftCluster{
		localRaftID: raft.ServerID(*clusterID),
		servers:     servers,
	}
	if err := r.Start(); err != nil {
		log.Infof("rafter cluster start failed, %v", err)
		os.Exit(1)
	}
	r.checkLeaderState()
}
// SimpleFsm is a simple FSM implementation; Start passes it to raft.NewRaft
// as the state machine.
type SimpleFsm struct {
// db is the backing database; Snapshot returns a pointer to it.
db database
}
// NewFsm returns a SimpleFsm whose database was created with NewDatabase.
func NewFsm() *SimpleFsm {
	return &SimpleFsm{db: NewDatabase()}
}
// Apply receives a Raft log entry. This stub ignores the entry, changes no
// state and always returns nil.
func (f *SimpleFsm) Apply(l *raft.Log) interface{} {
return nil
}
// Snapshot returns a pointer to the FSM's own database as the snapshot
// object. No copy is made and the error is always nil.
func (f *SimpleFsm) Snapshot() (raft.FSMSnapshot, error) {
	var snap raft.FSMSnapshot = &f.db
	return snap, nil
}
// Restore receives a snapshot stream. This stub neither reads nor closes the
// stream, restores nothing and always returns nil.
func (f *SimpleFsm) Restore(io.ReadCloser) error {
return nil
}
// database is a placeholder store with no fields; its methods are stubs.
type database struct{}
// NewDatabase returns the zero-value database.
func NewDatabase() database {
	var db database
	return db
}
// Get is a stub: it ignores key and always returns the placeholder text.
func (d *database) Get(key string) string {
	const placeholder = "not implemented"
	return placeholder
}
// Set is a stub: it ignores key and value and stores nothing.
func (d *database) Set(key, value string) {}
// Persist writes the database contents (currently an empty byte slice) to
// sink and closes it to finalise the snapshot.
//
// If the write fails the sink is cancelled and the wrapped write error is
// returned; if closing fails the wrapped close error is returned. Errors are
// reported instead of being discarded, so a failed snapshot is not treated as
// a success.
func (d *database) Persist(sink raft.SnapshotSink) error {
	if _, err := sink.Write([]byte{}); err != nil {
		// Best-effort: discard the partial snapshot; the write error is
		// the one worth reporting.
		_ = sink.Cancel()
		return fmt.Errorf("write snapshot: %w", err)
	}
	if err := sink.Close(); err != nil {
		return fmt.Errorf("close snapshot sink: %w", err)
	}
	return nil
}
// Release is a no-op: the database holds nothing that needs releasing.
func (d *database) Release() {}