使用 Python API 的 Ansible Playbook 重试逻辑
Ansible Playbook Retry Logic Using Python API
使用 Ansible 2 Python API 我能够 运行 剧本并使用自定义回调处理程序处理结果(感谢 this question)。一切正常,但现在我想为 PlaybookExecutor 实现一个简单的重试循环。
我的回调处理程序所做的就是将所有失败的任务填充到数组中,如果我发现数组不为空,则将其视为失败并重试。
我有另一个 python 模块使用此脚本启动剧本。对 run_playbook 的调用嵌套在 try/except 块中,我想要一个异常冒泡以便我可以正确处理失败。
我想在 运行ning 尝试我的剧本 3 次,如果全部失败则引发异常。
这是我的代码:
#! /usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import print_function
import logging
import os
from collections import namedtuple
from ansible.parsing.dataloader import DataLoader
from ansible.vars.manager import VariableManager
from ansible.inventory.manager import InventoryManager
from ansible.executor.playbook_executor import PlaybookExecutor
from ansible.plugins.callback import CallbackBase
class ResultsCallback(CallbackBase):
""" A callback plugin used for performing an action as results come in """
def __init__(self):
super(ResultsCallback, self).__init__()
# Store all failed results
self.failed = []
def v2_runner_on_failed(self, result, ignore_errors=False):
if ignore_errors:
self._display.display("...ignoring", color=C.COLOR_SKIP)
host = result._host
self.failed.append(result.task_name)
def create_inventory_file(hostnames):
inv_file = 'ansible_hosts.{0}'.format(os.getppid())
logging.print('\nCreating Ansible host file: {0}/{1}'.format(os.path.join(os.path.expanduser('~')), inv_file))
with open(os.path.join(os.path.expanduser('~'), inv_file), 'w') as host_file:
# If ec2, stuff into an '[ec2]' group.
# Otherwise don't use a group header
if 'ec2' in hostnames[0]:
host_file.write('[ec2]\n')
for host in hostnames:
host_file.write('{0}\n'.format(host))
return os.path.join(os.path.expanduser('~'), inv_file)
def run_playbook(hostnames, playbook, playbook_arguments, host_file=False):
# If user passes in the optional arg host_file, then just use that one.
if not host_file:
host_file = create_inventory_file(hostnames)
if not os.path.isfile(host_file):
logging.critical('Host file does not exist. Make sure absolute path is correct.\nInventory: {0}'.format(host_file))
raise RuntimeError('Host file does not exist')
loader = DataLoader()
inventory = InventoryManager(loader=loader, sources=host_file)
variable_manager = VariableManager(loader=loader, inventory=inventory)
# Add extra variables to use in playbook like so:
# variable_manager.extra_vars = {'name': 'value'}
if playbook_arguments:
variable_manager.extra_vars = playbook_arguments
Options = namedtuple('Options', ['listtags', 'listtasks', 'listhosts', 'syntax', 'connection','module_path', 'forks', 'remote_user', 'become', 'become_method', 'become_user', 'verbosity', 'check', 'diff', 'ask_sudo_pass'])
if 'superuser' in playbook_arguments:
remote_user = playbook_arguments['superuser']
else:
remote_user = 'ec2-user'
options = Options(listtags=None, listtasks=None, listhosts=None, syntax=None, connection='smart', module_path=None, forks=100, remote_user=remote_user, become=None, become_method='sudo', become_user='root', verbosity=None, check=False, diff=False, ask_sudo_pass=None)
pbex = PlaybookExecutor(playbooks=[playbook], inventory=inventory, variable_manager=variable_manager, loader=loader, options=options, passwords={})
callback = ResultsCallback()
pbex._tqm._stdout_callback = callback
logging.print('Provisioning cluster with Ansible...')
attempts = 3
for i in range(attempts):
try:
pbex.run()
failed = callback.failed
if failed:
logging.critical('Playbook failed!')
raise RuntimeError('{0} tasks failed'.format(len(failed)))
break
except:
if i < attempts - 1:
logging.critical('Attempting to re-try playbook')
continue
else:
raise
logging.print('\nRemoving Ansible Inventory file {0}'.format(host_file))
try:
os.remove(host_file)
except OSError:
pass
但是,当我使用肯定会失败的剧本测试上述代码时,它失败并出现以下回溯:
Creating Ansible host file: /home/someuser/ansible_hosts.18600
Provisioning cluster with Ansible...
Playbook failed!
Attempting to re-try playbook
Exception during setup; tearing down all created instances
Traceback (most recent call last):
File "./manage_aws.py", line 486, in cmd_ec2_create
manage_ansible.run_playbook(hostnames, playbook, playbook_arguments)
File "/home/someuser/manage_ansible.py", line 88, in run_playbook
break
File "/usr/local/lib/python2.7/dist-packages/ansible/executor/playbook_executor.py", line 159, in run
result = self._tqm.run(play=play)
File "/usr/local/lib/python2.7/dist-packages/ansible/executor/task_queue_manager.py", line 296, in run
strategy.cleanup()
File "/usr/local/lib/python2.7/dist-packages/ansible/plugins/strategy/__init__.py", line 223, in cleanup
self._final_q.put(_sentinel)
File "/usr/lib/python2.7/multiprocessing/queues.py", line 100, in put
assert not self._closed
AssertionError
您会注意到调用脚本 manage_aws.py ("Exception during setup; tearing down all created instances") 中正确捕获了异常,我们将拆除这些实例。太好了,但我想在决定这样做之前正确地重新尝试剧本。
我不是 Python 大师,所以如果有人有任何提示,或者已经完成了类似的事情,那么我将非常感谢您的建议。
提前致谢!
我找到了一个解决方案,虽然它不像我希望的那样优雅。
我遇到的问题似乎与在同一个 PlaybokExecutor 对象上重新运行有关,而没有让生成的线程正确清理。
当我注意到第一个失败时,我只是初始化了一个新的 PlaybookExecutor 对象来修复它。当前的实现只允许重试一次,这很好,但我很可能会根据需要对其进行调整以执行更多操作。
这是我修改后的重试逻辑:
pbex = PlaybookExecutor(playbooks=[playbook], inventory=inventory, variable_manager=variable_manager, loader=loader, options=options, passwords={})
callback = ResultsCallback()
pbex._tqm._stdout_callback = callback
logging.print('Provisioning cluster with Ansible...')
pbex.run()
failed = callback.failed
if failed:
logging.critical('Playbook failed! Attempting retry...')
pbex_retry = PlaybookExecutor(playbooks=[playbook], inventory=inventory, variable_manager=variable_manager, loader=loader, options=options, passwords={})
callback_retry = ResultsCallback()
pbex_retry._tqm._stdout_callback = callback_retry
pbex_retry.run()
failed_retry = callback_retry.failed
if failed_retry:
logging.critical('Playbook failed again! Failed on task:\n{0}'.format(failed_retry[0]))
remove_inventory_file(host_file)
raise RuntimeError('Playbook failed to successfully configure the cluster.')
remove_inventory_file(host_file)
超级简单的解决方案,但遗憾的是我最初的尝试没有达到预期效果。也许我会重新访问它并尝试在失败时正确清理执行程序。
使用 Ansible 2 Python API 我能够 运行 剧本并使用自定义回调处理程序处理结果(感谢 this question)。一切正常,但现在我想为 PlaybookExecutor 实现一个简单的重试循环。
我的回调处理程序所做的就是将所有失败的任务填充到数组中,如果我发现数组不为空,则将其视为失败并重试。
我有另一个 python 模块使用此脚本启动剧本。对 run_playbook 的调用嵌套在 try/except 块中,我想要一个异常冒泡以便我可以正确处理失败。
我想在 运行ning 尝试我的剧本 3 次,如果全部失败则引发异常。
这是我的代码:
#! /usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import print_function
import logging
import os
from collections import namedtuple
from ansible.parsing.dataloader import DataLoader
from ansible.vars.manager import VariableManager
from ansible.inventory.manager import InventoryManager
from ansible.executor.playbook_executor import PlaybookExecutor
from ansible.plugins.callback import CallbackBase
class ResultsCallback(CallbackBase):
""" A callback plugin used for performing an action as results come in """
def __init__(self):
super(ResultsCallback, self).__init__()
# Store all failed results
self.failed = []
def v2_runner_on_failed(self, result, ignore_errors=False):
if ignore_errors:
self._display.display("...ignoring", color=C.COLOR_SKIP)
host = result._host
self.failed.append(result.task_name)
def create_inventory_file(hostnames):
inv_file = 'ansible_hosts.{0}'.format(os.getppid())
logging.print('\nCreating Ansible host file: {0}/{1}'.format(os.path.join(os.path.expanduser('~')), inv_file))
with open(os.path.join(os.path.expanduser('~'), inv_file), 'w') as host_file:
# If ec2, stuff into an '[ec2]' group.
# Otherwise don't use a group header
if 'ec2' in hostnames[0]:
host_file.write('[ec2]\n')
for host in hostnames:
host_file.write('{0}\n'.format(host))
return os.path.join(os.path.expanduser('~'), inv_file)
def run_playbook(hostnames, playbook, playbook_arguments, host_file=False):
# If user passes in the optional arg host_file, then just use that one.
if not host_file:
host_file = create_inventory_file(hostnames)
if not os.path.isfile(host_file):
logging.critical('Host file does not exist. Make sure absolute path is correct.\nInventory: {0}'.format(host_file))
raise RuntimeError('Host file does not exist')
loader = DataLoader()
inventory = InventoryManager(loader=loader, sources=host_file)
variable_manager = VariableManager(loader=loader, inventory=inventory)
# Add extra variables to use in playbook like so:
# variable_manager.extra_vars = {'name': 'value'}
if playbook_arguments:
variable_manager.extra_vars = playbook_arguments
Options = namedtuple('Options', ['listtags', 'listtasks', 'listhosts', 'syntax', 'connection','module_path', 'forks', 'remote_user', 'become', 'become_method', 'become_user', 'verbosity', 'check', 'diff', 'ask_sudo_pass'])
if 'superuser' in playbook_arguments:
remote_user = playbook_arguments['superuser']
else:
remote_user = 'ec2-user'
options = Options(listtags=None, listtasks=None, listhosts=None, syntax=None, connection='smart', module_path=None, forks=100, remote_user=remote_user, become=None, become_method='sudo', become_user='root', verbosity=None, check=False, diff=False, ask_sudo_pass=None)
pbex = PlaybookExecutor(playbooks=[playbook], inventory=inventory, variable_manager=variable_manager, loader=loader, options=options, passwords={})
callback = ResultsCallback()
pbex._tqm._stdout_callback = callback
logging.print('Provisioning cluster with Ansible...')
attempts = 3
for i in range(attempts):
try:
pbex.run()
failed = callback.failed
if failed:
logging.critical('Playbook failed!')
raise RuntimeError('{0} tasks failed'.format(len(failed)))
break
except:
if i < attempts - 1:
logging.critical('Attempting to re-try playbook')
continue
else:
raise
logging.print('\nRemoving Ansible Inventory file {0}'.format(host_file))
try:
os.remove(host_file)
except OSError:
pass
但是,当我使用肯定会失败的剧本测试上述代码时,它失败并出现以下回溯:
Creating Ansible host file: /home/someuser/ansible_hosts.18600
Provisioning cluster with Ansible...
Playbook failed!
Attempting to re-try playbook
Exception during setup; tearing down all created instances
Traceback (most recent call last):
File "./manage_aws.py", line 486, in cmd_ec2_create
manage_ansible.run_playbook(hostnames, playbook, playbook_arguments)
File "/home/someuser/manage_ansible.py", line 88, in run_playbook
break
File "/usr/local/lib/python2.7/dist-packages/ansible/executor/playbook_executor.py", line 159, in run
result = self._tqm.run(play=play)
File "/usr/local/lib/python2.7/dist-packages/ansible/executor/task_queue_manager.py", line 296, in run
strategy.cleanup()
File "/usr/local/lib/python2.7/dist-packages/ansible/plugins/strategy/__init__.py", line 223, in cleanup
self._final_q.put(_sentinel)
File "/usr/lib/python2.7/multiprocessing/queues.py", line 100, in put
assert not self._closed
AssertionError
您会注意到调用脚本 manage_aws.py ("Exception during setup; tearing down all created instances") 中正确捕获了异常,我们将拆除这些实例。太好了,但我想在决定这样做之前正确地重新尝试剧本。
我不是 Python 大师,所以如果有人有任何提示,或者已经完成了类似的事情,那么我将非常感谢您的建议。
提前致谢!
我找到了一个解决方案,虽然它不像我希望的那样优雅。
我遇到的问题似乎与在同一个 PlaybokExecutor 对象上重新运行有关,而没有让生成的线程正确清理。
当我注意到第一个失败时,我只是初始化了一个新的 PlaybookExecutor 对象来修复它。当前的实现只允许重试一次,这很好,但我很可能会根据需要对其进行调整以执行更多操作。
这是我修改后的重试逻辑:
pbex = PlaybookExecutor(playbooks=[playbook], inventory=inventory, variable_manager=variable_manager, loader=loader, options=options, passwords={})
callback = ResultsCallback()
pbex._tqm._stdout_callback = callback
logging.print('Provisioning cluster with Ansible...')
pbex.run()
failed = callback.failed
if failed:
logging.critical('Playbook failed! Attempting retry...')
pbex_retry = PlaybookExecutor(playbooks=[playbook], inventory=inventory, variable_manager=variable_manager, loader=loader, options=options, passwords={})
callback_retry = ResultsCallback()
pbex_retry._tqm._stdout_callback = callback_retry
pbex_retry.run()
failed_retry = callback_retry.failed
if failed_retry:
logging.critical('Playbook failed again! Failed on task:\n{0}'.format(failed_retry[0]))
remove_inventory_file(host_file)
raise RuntimeError('Playbook failed to successfully configure the cluster.')
remove_inventory_file(host_file)
超级简单的解决方案,但遗憾的是我最初的尝试没有达到预期效果。也许我会重新访问它并尝试在失败时正确清理执行程序。