SaltStack源码阅读

前面理了下salt-master的启动流程, 这次来看看salt-minion的启动流程.

启动salt-minion方法:

/etc/init.d/salt-minion start

看看/etc/init.d/salt-master逻辑:

$ cat /etc/init.d/salt-master

SALTMINION=/usr/bin/salt-minion
PYTHON=/usr/bin/python
MINION_ARGS="" start() {
echo -n $"Starting salt-minion daemon: "
if [ -f $SUSE_RELEASE ]; then
startproc -f -p /var/run/$SERVICE.pid $SALTMINION -d $MINION_ARGS
rc_status -v
elif [ -e $DEBIAN_VERSION ]; then
if [ -f $LOCKFILE ]; then
echo -n "already started, lock file found"
RETVAL=1
elif $PYTHON $SALTMINION -d $MINION_ARGS >& /dev/null; then
echo -n "OK"
RETVAL=0
fi
else
if [[ ! -z "$(pidofproc -p /var/run/$SERVICE.pid $PROCESS)" ]]; then
RETVAL=$?
echo -n "already running"
else
daemon --check $SERVICE $SALTMINION -d $MINION_ARGS
fi
fi
RETVAL=$?
echo
return $RETVAL
}

继续看看/usr/bin/salt-minion:

$ cat /usr/bin/salt-master

#!/usr/bin/python
'''
This script is used to kick off a salt minion daemon
''' from salt.scripts import salt_minion
from multiprocessing import freeze_support if __name__ == '__main__':
# This handles the bootstrapping code that is included with frozen
# scripts. It is a no-op on unfrozen code.
freeze_support()
salt_minion()

调用salt_minion()方法, 在script.py里:

$ cat scripts.py

def salt_minion():
'''
Start the salt minion.
'''
import salt.cli.daemons
import multiprocessing
if '' in sys.path:
sys.path.remove('') if salt.utils.is_windows():
minion = salt.cli.daemons.Minion()
minion.start()
return if '--disable-keepalive' in sys.argv:
sys.argv.remove('--disable-keepalive')
minion = salt.cli.daemons.Minion()
minion.start()
return # keep one minion subprocess running
while True:
try:
queue = multiprocessing.Queue()
except Exception:
# This breaks in containers
minion = salt.cli.daemons.Minion()
minion.start()
return
process = multiprocessing.Process(target=minion_process, args=(queue,))
process.start()
try:
process.join()
try:
restart_delay = queue.get(block=False)
except Exception:
if process.exitcode == 0:
# Minion process ended naturally, Ctrl+C or --version
break
restart_delay = 60
if restart_delay == 0:
# Minion process ended naturally, Ctrl+C, --version, etc.
break
# delay restart to reduce flooding and allow network resources to close
time.sleep(restart_delay)
except KeyboardInterrupt:
break
# need to reset logging because new minion objects
# cause extra log handlers to accumulate
rlogger = logging.getLogger()
for handler in rlogger.handlers:
rlogger.removeHandler(handler)
logging.basicConfig()

这里启动minion使用了multiprocessing.Process方法, target是minion_process函数, 主要流程是:

def minion_process(queue):
'''
Start a minion process
'''
import salt.cli.daemons
# salt_minion spawns this function in a new process def suicide_when_without_parent(parent_pid):
'''
Have the minion suicide if the parent process is gone NOTE: there is a small race issue where the parent PID could be replace
with another process with the same PID!
'''
while True:
time.sleep(5)
try:
# check pid alive (Unix only trick!)
os.kill(parent_pid, 0)
except OSError:
# forcibly exit, regular sys.exit raises an exception-- which
# isn't sufficient in a thread
os._exit(999)
if not salt.utils.is_windows():
thread = threading.Thread(target=suicide_when_without_parent, args=(os.getppid(),))
thread.start() restart = False
minion = None
try:
minion = salt.cli.daemons.Minion()
minion.start()
except (Exception, SaltClientError, SaltReqTimeoutError, SaltSystemExit) as exc:
log.error('Minion failed to start: ', exc_info=True)
restart = True
except SystemExit as exc:
restart = False if restart is True:
log.warn('** Restarting minion **')
delay = 60
if minion is not None:
if hasattr(minion, 'config'):
delay = minion.config.get('random_reauth_delay', 60)
random_delay = randint(1, delay)
log.info('Sleeping random_reauth_delay of {0} seconds'.format(random_delay))
# preform delay after minion resources have been cleaned
queue.put(random_delay)
else:
queue.put(0)

这里调用了salt.cli.daemons.Minion的start方法, 目标类文件在: ~/salt/cli/daemons.py

class Minion(parsers.MinionOptionParser):
'''
Create a minion server
''' def prepare(self):
'''
Run the preparation sequence required to start a salt minion. If sub-classed, don't **ever** forget to run: super(YourSubClass, self).prepare()
'''
self.parse_args() try:
if self.config['verify_env']:
confd = self.config.get('default_include')
if confd:
# If 'default_include' is specified in config, then use it
if '*' in confd:
# Value is of the form "minion.d/*.conf"
confd = os.path.dirname(confd)
if not os.path.isabs(confd):
# If configured 'default_include' is not an absolute
# path, consider it relative to folder of 'conf_file'
# (/etc/salt by default)
confd = os.path.join(
os.path.dirname(self.config['conf_file']), confd
)
else:
confd = os.path.join(
os.path.dirname(self.config['conf_file']), 'minion.d'
)
v_dirs = [
self.config['pki_dir'],
self.config['cachedir'],
self.config['sock_dir'],
self.config['extension_modules'],
confd,
]
if self.config.get('transport') == 'raet':
v_dirs.append(os.path.join(self.config['pki_dir'], 'accepted'))
v_dirs.append(os.path.join(self.config['pki_dir'], 'pending'))
v_dirs.append(os.path.join(self.config['pki_dir'], 'rejected'))
v_dirs.append(os.path.join(self.config['cachedir'], 'raet'))
verify_env(
v_dirs,
self.config['user'],
permissive=self.config['permissive_pki_access'],
pki_dir=self.config['pki_dir'],
)
logfile = self.config['log_file']
if logfile is not None and not logfile.startswith(('tcp://',
'udp://',
'file://')):
# Logfile is not using Syslog, verify
current_umask = os.umask(0o027)
verify_files([logfile], self.config['user'])
os.umask(current_umask)
except OSError as err:
logger.exception('Failed to prepare salt environment')
sys.exit(err.errno) self.setup_logfile_logger()
logger.info(
'Setting up the Salt Minion "{0}"'.format(
self.config['id']
)
)
migrations.migrate_paths(self.config)
# TODO: AIO core is separate from transport
if self.config['transport'].lower() in ('zeromq', 'tcp'):
# Late import so logging works correctly
import salt.minion
# If the minion key has not been accepted, then Salt enters a loop
# waiting for it, if we daemonize later then the minion could halt
# the boot process waiting for a key to be accepted on the master.
# This is the latest safe place to daemonize
self.daemonize_if_required()
self.set_pidfile()
if isinstance(self.config.get('master'), list):
if self.config.get('master_type') == 'failover':
self.minion = salt.minion.Minion(self.config)
else:
self.minion = salt.minion.MultiMinion(self.config)
else:
self.minion = salt.minion.Minion(self.config)
else:
import salt.daemons.flo
self.daemonize_if_required()
self.set_pidfile()
self.minion = salt.daemons.flo.IofloMinion(self.config) def start(self):
'''
Start the actual minion. If sub-classed, don't **ever** forget to run: super(YourSubClass, self).start() NOTE: Run any required code before calling `super()`.
'''
try:
self.prepare()
if check_user(self.config['user']):
logger.info('The salt minion is starting up')
self.minion.tune_in() #建立publisher
except (KeyboardInterrupt, SaltSystemExit) as exc:
logger.warn('Stopping the Salt Minion')
if isinstance(exc, KeyboardInterrupt):
logger.warn('Exiting on Ctrl-c')
else:
logger.error(str(exc))
finally:
self.shutdown()

在这里minion使用prepare方法准备salt minion参数和环境配置, 通过self.minion.tune_in()尝试建立publisher, 和master建立通信机制, 目标文件在: ~/salt/minion.py

class MultiMinion(MinionBase):
'''
如果minion-conf里配置了多minion, 则会使用MultiMinion来启动minion
Create a multi minion interface, this creates as many minions as are
defined in the master option and binds each minion object to a respective
master.
''' # Multi Master Tune In
def tune_in(self):
'''
Bind to the masters This loop will attempt to create connections to masters it hasn't connected
to yet, but once the initial connection is made it is up to ZMQ to do the
reconnect (don't know of an API to get the state here in salt)
'''
# Fire off all the minion coroutines
self.minions = self._spawn_minions() # serve forever!
self.io_loop.start()

这里tune_in()是关键的步骤, minion链接master, 维护minion Pub/Sub socket通信等等.

先看看tune_in()方法:

# Multi Master Tune In
def tune_in(self):
'''
Bind to the masters This loop will attempt to create connections to masters it hasn't connected
to yet, but once the initial connection is made it is up to ZMQ to do the
reconnect (don't know of an API to get the state here in salt)
'''
# Fire off all the minion coroutines
self.minions = self._spawn_minions() # serve forever!
self.io_loop.start() #启动io_loop异步事件驱动 def _spawn_minions(self):
'''
Spawn all the coroutines which will sign in to masters
'''
if not isinstance(self.opts['master'], list):
log.error(
'Attempting to start a multimaster system with one master')
sys.exit(salt.defaults.exitcodes.EX_GENERIC)
for master in set(self.opts['master']):
s_opts = copy.deepcopy(self.opts)
s_opts['master'] = master
s_opts['multimaster'] = True
s_opts['auth_timeout'] = self.MINION_CONNECT_TIMEOUT
self.io_loop.spawn_callback(self._connect_minion, s_opts) #callback调用_connect_minion def _connect_minion(self, opts):
'''
Create a minion, and asynchronously connect it to a master
'''
last = 0 # never have we signed in
auth_wait = opts['acceptance_wait_time']
while True:
try:
minion = Minion(opts,
self.MINION_CONNECT_TIMEOUT,
False,
io_loop=self.io_loop,
loaded_base_name='salt.loader.{0}'.format(opts['master']),
)
yield minion.connect_master()
minion.tune_in(start=False)
break
except SaltClientError as exc:
log.error('Error while bringing up minion for multi-master. Is master at {0} responding?'.format(opts['master']))
last = time.time()
if auth_wait < self.max_auth_wait:
auth_wait += self.auth_wait
yield tornado.gen.sleep(auth_wait) # TODO: log?
except Exception as e:
log.critical('Unexpected error while connecting to {0}'.format(opts['master']), exc_info=True)

到这里大家会发现tune_in进入循环了... 没错! minion server进程会无限循环下去, 维护minion Pub/Sub socket, 保持和salt master的链接, 监听event事件完成Job工作等等.

接下来看看connect_master()方法:

def connect_master(self):
'''
Return a future which will complete when you are connected to a master
'''
master, self.pub_channel = yield self.eval_master(self.opts, self.timeout, self.safe) yield self._post_master_init(master)

在eval_master里, 如果master_type是func, 则会load所有models; 如果master_type是failover, 则会获取master address list, 进而会调用salt.transport.client.AsyncPubChannel.factory()生成pub_channel, 调用connect()方法链接master主机, 具体逻辑是下面这样的:

def eval_master(self,
opts,
timeout=60,
safe=True,
failed=False): # check if master_type was altered from its default
if opts['master_type'] != 'str':
# check for a valid keyword
if opts['master_type'] == 'func': #func类型
# split module and function and try loading the module
mod, fun = opts['master'].split('.')
try:
master_mod = salt.loader.raw_mod(opts, mod, fun) #load各个modules
if not master_mod:
raise TypeError
# we take whatever the module returns as master address
opts['master'] = master_mod[mod + '.' + fun]() #return master地址
except TypeError:
msg = ('Failed to evaluate master address from '
'module \'{0}\''.format(opts['master']))
log.error(msg)
sys.exit(salt.defaults.exitcodes.EX_GENERIC)
log.info('Evaluated master from module: {0}'.format(master_mod)) # if failover is set, master has to be of type list
elif opts['master_type'] == 'failover': #failover类型
if isinstance(opts['master'], list): '''
使用isinstance方法, failover链接master
'''
log.info('Got list of available master addresses:'
' {0}'.format(opts['master']))
if opts['master_shuffle']:
'''
If master is a list of addresses, shuffle them before trying to connect to distribute the minions over all available masters.
'''
shuffle(opts['master'])
# if opts['master'] is a str and we have never created opts['master_list']
elif isinstance(opts['master'], str) and ('master_list' not in opts):
# We have a string, but a list was what was intended. Convert.
# See issue 23611 for details
opts['master'] = list(opts['master'])
elif opts['__role'] == 'syndic':#二级master
log.info('Syndic setting master_syndic to \'{0}\''.format(opts['master'])) # if failed=True, the minion was previously connected
# we're probably called from the minions main-event-loop
# because a master connection loss was detected. remove
# the possibly failed master from the list of masters.
elif failed:
log.info('Removing possibly failed master {0} from list of'
' masters'.format(opts['master']))
# create new list of master with the possibly failed one removed
opts['master'] = [x for x in opts['master_list'] if opts['master'] != x] else:
msg = ('master_type set to \'failover\' but \'master\' '
'is not of type list but of type '
'{0}'.format(type(opts['master'])))
log.error(msg)
sys.exit(salt.defaults.exitcodes.EX_GENERIC)
else:
msg = ('Invalid keyword \'{0}\' for variable '
'\'master_type\''.format(opts['master_type']))
log.error(msg)
sys.exit(salt.defaults.exitcodes.EX_GENERIC) # if we have a list of masters, loop through them and be
# happy with the first one that allows us to connect
if isinstance(opts['master'], list):
conn = False
# shuffle the masters and then loop through them
local_masters = copy.copy(opts['master']) for master in local_masters:
opts['master'] = master
opts.update(resolve_dns(opts))
super(Minion, self).__init__(opts) # TODO: only run init once?? This will run once per attempt # on first run, update self.opts with the whole master list
# to enable a minion to re-use old masters if they get fixed
if 'master_list' not in opts:
opts['master_list'] = local_masters try:
pub_channel = salt.transport.client.AsyncPubChannel.factory(opts,
timeout=timeout, safe=safe,
io_loop=self.io_loop,
)
yield pub_channel.connect()
conn = True
break
except SaltClientError: '''
链接master失败, 尝试下一个master链接
'''
msg = ('Master {0} could not be reached, trying '
'next master (if any)'.format(opts['master']))
log.info(msg)
continue if not conn:
self.connected = False
msg = ('No master could be reached or all masters denied '
'the minions connection attempt.')
log.error(msg)
else:
self.connected = True
raise tornado.gen.Return((opts['master'], pub_channel)) # single master sign in
else:
opts.update(resolve_dns(opts)) #处理dns解析master ip address
pub_channel = salt.transport.client.AsyncPubChannel.factory(self.opts,
timeout=timeout,
safe=safe,
io_loop=self.io_loop,
)
yield pub_channel.connect()
self.tok = pub_channel.auth.gen_token('salt')
self.connected = True
raise tornado.gen.Return((opts['master'], pub_channel))

可以看出在上面的逻辑中, 无论是multi master还是single master, 最终都会调用salt.transport.client.AsyncPubChannel类的factory()和connect()方法来完成connect master, 目标文件在: ~/salt/transport/client.py里.

class AsyncPubChannel(AsyncChannel):
'''
Factory class to create subscription channels to the master's Publisher
'''
@classmethod
def factory(cls, opts, **kwargs):
# Default to ZeroMQ for now
ttype = 'zeromq' # determine the ttype
if 'transport' in opts:
ttype = opts['transport']
elif 'transport' in opts.get('pillar', {}).get('master', {}):
ttype = opts['pillar']['master']['transport'] # switch on available ttypes
if ttype == 'zeromq':
import salt.transport.zeromq
return salt.transport.zeromq.AsyncZeroMQPubChannel(opts, **kwargs)
elif ttype == 'raet': # TODO:
import salt.transport.raet
return salt.transport.raet.AsyncRAETPubChannel(opts, **kwargs)
elif ttype == 'tcp':
if not cls._resolver:
# TODO: add opt to specify number of resolver threads
AsyncChannel._init_resolver()
import salt.transport.tcp
return salt.transport.tcp.AsyncTCPPubChannel(opts, **kwargs)
elif ttype == 'local': # TODO:
import salt.transport.local
return salt.transport.local.AsyncLocalPubChannel(opts, **kwargs)
else:
raise Exception('Channels are only defined for ZeroMQ and raet')
# return NewKindOfChannel(opts, **kwargs)

现在基本都是基于zeromq通信的, salt.transport.zeromq.AsyncZeroMQPubChannel(opts, **kwargs)方法处理pub_channel, 目标类文件在: ~/salt/transport/zeromq.py

class AsyncZeroMQPubChannel(salt.transport.mixins.auth.AESPubClientMixin, salt.transport.client.AsyncPubChannel):
'''
A transport channel backed by ZeroMQ for a Salt Publisher to use to
publish commands to connected minions
'''
def __init__(self,
opts,
**kwargs):
self.opts = opts
self.ttype = 'zeromq' if 'io_loop' in kwargs:
self.io_loop = kwargs['io_loop']
else:
self.io_loop = tornado.ioloop.IOLoop() #IOLoop网络事件 self.hexid = hashlib.sha1(self.opts['id']).hexdigest() self.auth = salt.crypt.AsyncAuth(self.opts, io_loop=self.io_loop) #异步的网络加密验证 self.serial = salt.payload.Serial(self.opts) self.context = zmq.Context() #ZMQ发布订阅模型(PUB/SUB)
self._socket = self.context.socket(zmq.SUB) #启动zmq的SUB(client)端socket if self.opts['zmq_filtering']:
# TODO: constants file for "broadcast"
self._socket.setsockopt(zmq.SUBSCRIBE, 'broadcast')
self._socket.setsockopt(zmq.SUBSCRIBE, self.hexid)
else:
self._socket.setsockopt(zmq.SUBSCRIBE, '') self._socket.setsockopt(zmq.SUBSCRIBE, '')
self._socket.setsockopt(zmq.IDENTITY, self.opts['id']) #minion自己的conf中的id # TODO: cleanup all the socket opts stuff
if hasattr(zmq, 'TCP_KEEPALIVE'):
self._socket.setsockopt(
zmq.TCP_KEEPALIVE, self.opts['tcp_keepalive']
)
self._socket.setsockopt(
zmq.TCP_KEEPALIVE_IDLE, self.opts['tcp_keepalive_idle']
)
self._socket.setsockopt(
zmq.TCP_KEEPALIVE_CNT, self.opts['tcp_keepalive_cnt']
)
self._socket.setsockopt(
zmq.TCP_KEEPALIVE_INTVL, self.opts['tcp_keepalive_intvl']
) recon_delay = self.opts['recon_default'] if self.opts['recon_randomize']:
recon_delay = randint(self.opts['recon_default'],
self.opts['recon_default'] + self.opts['recon_max']
) log.debug("Generated random reconnect delay between '{0}ms' and '{1}ms' ({2})".format(
self.opts['recon_default'],
self.opts['recon_default'] + self.opts['recon_max'],
recon_delay)
) log.debug("Setting zmq_reconnect_ivl to '{0}ms'".format(recon_delay))
self._socket.setsockopt(zmq.RECONNECT_IVL, recon_delay) if hasattr(zmq, 'RECONNECT_IVL_MAX'):
log.debug("Setting zmq_reconnect_ivl_max to '{0}ms'".format(
self.opts['recon_default'] + self.opts['recon_max'])
) self._socket.setsockopt(
zmq.RECONNECT_IVL_MAX, self.opts['recon_max']
) if self.opts['ipv6'] is True and hasattr(zmq, 'IPV4ONLY'):
# IPv6 sockets work for both IPv6 and IPv4 addresses
self._socket.setsockopt(zmq.IPV4ONLY, 0) if HAS_ZMQ_MONITOR and self.opts['zmq_monitor']:
self._monitor = ZeroMQSocketMonitor(self._socket)
self._monitor.start_io_loop(self.io_loop) # TODO: this is the time to see if we are connected, maybe use the req channel to guess?
@tornado.gen.coroutine
def connect(self):
if not self.auth.authenticated: #判断是否auth验证
yield self.auth.authenticate()
self.publish_port = self.auth.creds['publish_port']
self._socket.connect(self.master_pub) #连接master

看到这里也就明白了, minion在启动后最终使用了ZMQ库的Pub/Sub模型, connect方法链接master机器.

至此salt-minion算是启动起来了 -_-


先去喝杯水, 我想静静 -

05-28 13:36