Beam - 在 Beam 管道开始和结束时各只运行一次的函数
Beam - Functions to run only once at the start and end of a Beam Pipeline
我有一个查询 BigQuery 然后将结果上传到 BigTable 的 Beam 管道。我想在我的管道开始之前扩展我的 BigTable 实例(从 1 个节点到 10 个节点),然后在结果加载到 BigTable 之后缩减(从 10 个节点到 1 个节点)。有什么机制可以用 Beam 做到这一点吗?
我基本上想要两个单独的转换,一个在管道的开头,一个在结尾,分别对节点进行扩容和缩容。或者,有没有办法让 DoFn 的 setup() 和 teardown() 只在一个 worker 上触发?
我尝试过使用 DoFn 生命周期函数 setup() 和 teardown()。但是,这些函数在每个 worker 上都会执行一次(我使用了数百个 worker),因此会多次尝试对 BigTable 进行扩容和缩容(并触及当天的实例和集群写入配额)。所以这并不真正适用于我的用例。无论如何,这是我一直在试验的 BigTableWriteFn 的片段:
class _BigTableWriteFn(beam.DoFn):
    """Experimental write DoFn that resizes the Bigtable cluster around the write.

    ``setup()`` scales the cluster to the requested node count and
    ``teardown()`` restores the count recorded in ``setup()``.

    NOTE: this approach is still flawed. ``setup()`` and ``teardown()`` run
    once per worker rather than once for the whole transform, so any worker
    that starts after the first resize records the already-scaled count as its
    "initial" value.
    """

    def __init__(self, project_id, instance_id, table_id, cluster_id, node_count):
        """Store the connection options and initialise per-worker state.

        Every option is read later through its ``.get()`` method.

        Args:
            project_id: provider of the GCP project id.
            instance_id: provider of the Bigtable instance id.
            table_id: provider of the Bigtable table id.
            cluster_id: provider of the Bigtable cluster id.
            node_count: provider of the node count to scale to in ``setup()``.
        """
        beam.DoFn.__init__(self)
        self.beam_options = {
            'project_id': project_id,
            'instance_id': instance_id,
            'table_id': table_id,
            'cluster_id': cluster_id,
            'node_count': node_count
        }
        self.table = None
        self.initial_node_count = None
        self.batcher = None
        self.written = Metrics.counter(self.__class__, 'Written Row')

    def setup(self):
        """Record the cluster's current node count and scale it to the target."""
        client = Client(project=self.beam_options['project_id'].get(), admin=True)
        instance = client.instance(self.beam_options['instance_id'].get())
        node_count = self.beam_options['node_count'].get()
        cluster = instance.cluster(self.beam_options['cluster_id'].get())
        # Fetch the live cluster state first; without reload() the local
        # object's serve_nodes does not reflect what the service reports.
        cluster.reload()
        self.initial_node_count = cluster.serve_nodes
        if node_count != self.initial_node_count:
            cluster.serve_nodes = node_count
            cluster.update()

    ## other life cycle methods in between but aren't important to the question

    def teardown(self):
        """Restore the node count recorded in ``setup()``, if it differs."""
        if self.initial_node_count is None:
            # setup() never recorded a count, so there is nothing to restore.
            return
        client = Client(project=self.beam_options['project_id'].get(), admin=True)
        instance = client.instance(self.beam_options['instance_id'].get())
        cluster = instance.cluster(self.beam_options['cluster_id'].get())
        # Compare against the live node count, not the unloaded default.
        cluster.reload()
        if cluster.serve_nodes != self.initial_node_count:
            cluster.serve_nodes = self.initial_node_count
            cluster.update()
我还使用 RuntimeValueProvider 参数来传递 bigtable 的各个 id(project_id、instance_id、cluster_id 等),所以我觉得无论用哪种转换来做扩缩容,都需要使用 DoFn。
如有任何帮助,我们将不胜感激!
如果您不是以模板方式运行 Dataflow 作业,而是以 VM 或 pod 中的 jar 方式运行,那么您可以在管道启动之前和结束之后,通过从 Java 中执行 bash 命令来完成此操作。参考这个 -
要执行的命令-
gcloud bigtable clusters update CLUSTER_ID --instance=INSTANCE_ID --num-nodes=NUM_NODES
但是,如果您以模板方式运行,那么模板文件只会包含管道开始和结束之间的内容,不会考虑其他任何内容。
所以我想出了一个 hacky 方法,但它有效。
在我的 WriteFn 的 setup() 期间,我读取 cluster.serve_nodes 计数(在第一个 worker 调用 setup() 之后,这个值显然会改变),如果它不是所需的节点数,就对集群进行扩容。在 process() 函数中,我 yield 这个计数。然后我执行 beam.CombineGlobally,用 Smallest(1) 找出这些计数中的最小值。然后我将其传递给另一个 DoFn,由它把集群缩减到该最小节点数。
这是我正在做的一些代码片段。
class _BigTableWriteFn(beam.DoFn):
    """Write each incoming row to Bigtable through a mutations batcher.

    ``setup()`` also records the cluster's serve-node count and scales the
    cluster to the requested size. ``process()`` emits that recorded count so
    a downstream step can pick the smallest one and scale back down.
    """

    def __init__(self, project_id, instance_id, table_id, cluster_id, node_count):
        """Store the connection options and initialise per-worker state.

        Every option is read later through its ``.get()`` method.

        Args:
            project_id: provider of the GCP project id to write rows to.
            instance_id: provider of the Bigtable instance id.
            table_id: provider of the Bigtable table id receiving the rows.
            cluster_id: provider of the id of the cluster to scale.
            node_count: provider of the node count to scale to before writing.
        """
        beam.DoFn.__init__(self)
        self.beam_options = {
            'project_id': project_id,
            'instance_id': instance_id,
            'table_id': table_id,
            'cluster_id': cluster_id,
            'node_count': node_count
        }
        self.table = None
        self.current_node_count = None
        self.batcher = None
        self.written = Metrics.counter(self.__class__, 'Written Row')

    def __getstate__(self):
        """Pickle only the options; the remaining attributes are rebuilt."""
        return self.beam_options

    def __setstate__(self, options):
        """Restore the options and reset every per-worker attribute."""
        self.beam_options = options
        self.table = None
        self.current_node_count = None
        self.batcher = None
        self.written = Metrics.counter(self.__class__, 'Written Row')

    def setup(self):
        """Record the live node count, then scale the cluster if it differs."""
        options = self.beam_options
        admin_client = Client(project=options['project_id'].get(), admin=True)
        bt_instance = admin_client.instance(options['instance_id'].get())
        bt_cluster = bt_instance.cluster(options['cluster_id'].get())
        bt_cluster.reload()
        target_nodes = options['node_count'].get()
        self.current_node_count = bt_cluster.serve_nodes
        if target_nodes == self.current_node_count:
            return
        bt_cluster.serve_nodes = target_nodes
        bt_cluster.update()

    def start_bundle(self):
        """Open the table on first use and create a batcher for this bundle."""
        if self.table is None:
            options = self.beam_options
            data_client = Client(project=options['project_id'].get())
            bt_instance = data_client.instance(options['instance_id'].get())
            self.table = bt_instance.table(options['table_id'].get())
        # finish_bundle() drops the batcher, so every bundle needs a new one.
        self.batcher = self.table.mutations_batcher()

    def process(self, row):
        """Queue ``row`` for writing and emit the node count seen in setup().

        Set the timestamp on the cells of the row object: a retry mutates the
        same object, and an explicit timestamp makes the cell take the new
        value. Example:

            direct_row.set_cell('cf1',
                                'field1',
                                'value1',
                                timestamp=datetime.datetime.now())
        """
        self.written.inc()
        self.batcher.mutate(row)
        # Emit the recorded node count so the minimum can be found later and
        # used to scale Bigtable back down.
        recorded_count = self.current_node_count
        if recorded_count:
            yield recorded_count

    def finish_bundle(self):
        """Flush the pending mutations and drop this bundle's batcher."""
        self.batcher.flush()
        self.batcher = None
class _BigTableScaleNodes(beam.DoFn):
    """Scale a Bigtable cluster to the node count received as input."""

    def __init__(self, project_id, instance_id, cluster_id):
        """Store the options that identify the cluster to scale.

        Every option is read later through its ``.get()`` method.

        Args:
            project_id: provider of the GCP project id.
            instance_id: provider of the Bigtable instance id.
            cluster_id: provider of the id of the cluster to scale.
        """
        beam.DoFn.__init__(self)
        self.beam_options = {
            'project_id': project_id,
            'instance_id': instance_id,
            'cluster_id': cluster_id,
        }
        self.cluster = None

    def setup(self):
        """Build the cluster handle once per DoFn instance."""
        if self.cluster is None:
            client = Client(project=self.beam_options['project_id'].get(), admin=True)
            instance = client.instance(self.beam_options['instance_id'].get())
            self.cluster = instance.cluster(self.beam_options['cluster_id'].get())

    def process(self, min_node_counts):
        """Scale the cluster to ``min_node_counts[0]`` if it is not there yet.

        Args:
            min_node_counts: sequence whose first element is the target node
                count; an empty sequence is ignored.
        """
        if not min_node_counts:
            return
        target_nodes = min_node_counts[0]
        # Refresh the handle so the comparison uses the live node count; the
        # handle built in setup() has never been loaded from the service.
        self.cluster.reload()
        if self.cluster.serve_nodes != target_nodes:
            self.cluster.serve_nodes = target_nodes
            self.cluster.update()
def run():
    """Build the BigQuery-to-Bigtable pipeline, run it and wait for it to finish.

    The write step emits node counts, the combine step keeps the smallest
    one, and the final step scales the cluster to that count.
    """
    custom_options = PipelineOptions().view_as(CustomOptions)
    pipeline_options = PipelineOptions()
    pipeline = beam.Pipeline(options=pipeline_options)

    query_results = pipeline | 'Query BigQuery' >> beam.io.Read(
        beam.io.BigQuerySource(query=QUERY, use_standard_sql=True))
    direct_rows = (
        query_results
        | 'Map Query Results to BigTable Rows' >> beam.Map(to_direct_rows))
    node_counts = direct_rows | 'Write BigTable Rows' >> beam.ParDo(
        _BigTableWriteFn(
            custom_options.bigtable_project_id,
            custom_options.bigtable_instance_id,
            custom_options.bigtable_table_id,
            custom_options.bigtable_cluster_id,
            custom_options.bigtable_node_count))
    smallest_count = (
        node_counts
        | 'Find Global Min Node Count' >> beam.CombineGlobally(
            beam.combiners.Smallest(1)))
    _ = smallest_count | 'Scale Down BigTable' >> beam.ParDo(
        _BigTableScaleNodes(
            custom_options.bigtable_project_id,
            custom_options.bigtable_instance_id,
            custom_options.bigtable_cluster_id))

    pipeline.run().wait_until_finish()
我有一个查询 BigQuery 然后将结果上传到 BigTable 的 Beam 管道。我想在我的管道开始之前扩展我的 BigTable 实例(从 1 个节点到 10 个节点),然后在结果加载到 BigTable 之后缩减(从 10 个节点到 1 个节点)。有什么机制可以用 Beam 做到这一点吗?
我基本上想要两个单独的转换,一个在管道的开头,一个在结尾,分别对节点进行扩容和缩容。或者,有没有办法让 DoFn 的 setup() 和 teardown() 只在一个 worker 上触发?
我尝试过使用 DoFn 生命周期函数 setup() 和 teardown()。但是,这些函数在每个 worker 上都会执行一次(我使用了数百个 worker),因此会多次尝试对 BigTable 进行扩容和缩容(并触及当天的实例和集群写入配额)。所以这并不真正适用于我的用例。无论如何,这是我一直在试验的 BigTableWriteFn 的片段:
class _BigTableWriteFn(beam.DoFn):
    """Experimental write DoFn that resizes the Bigtable cluster around the write.

    ``setup()`` scales the cluster to the requested node count and
    ``teardown()`` restores the count recorded in ``setup()``.

    NOTE: this approach is still flawed. ``setup()`` and ``teardown()`` run
    once per worker rather than once for the whole transform, so any worker
    that starts after the first resize records the already-scaled count as its
    "initial" value.
    """

    def __init__(self, project_id, instance_id, table_id, cluster_id, node_count):
        """Store the connection options and initialise per-worker state.

        Every option is read later through its ``.get()`` method.

        Args:
            project_id: provider of the GCP project id.
            instance_id: provider of the Bigtable instance id.
            table_id: provider of the Bigtable table id.
            cluster_id: provider of the Bigtable cluster id.
            node_count: provider of the node count to scale to in ``setup()``.
        """
        beam.DoFn.__init__(self)
        self.beam_options = {
            'project_id': project_id,
            'instance_id': instance_id,
            'table_id': table_id,
            'cluster_id': cluster_id,
            'node_count': node_count
        }
        self.table = None
        self.initial_node_count = None
        self.batcher = None
        self.written = Metrics.counter(self.__class__, 'Written Row')

    def setup(self):
        """Record the cluster's current node count and scale it to the target."""
        client = Client(project=self.beam_options['project_id'].get(), admin=True)
        instance = client.instance(self.beam_options['instance_id'].get())
        node_count = self.beam_options['node_count'].get()
        cluster = instance.cluster(self.beam_options['cluster_id'].get())
        # Fetch the live cluster state first; without reload() the local
        # object's serve_nodes does not reflect what the service reports.
        cluster.reload()
        self.initial_node_count = cluster.serve_nodes
        if node_count != self.initial_node_count:
            cluster.serve_nodes = node_count
            cluster.update()

    ## other life cycle methods in between but aren't important to the question

    def teardown(self):
        """Restore the node count recorded in ``setup()``, if it differs."""
        if self.initial_node_count is None:
            # setup() never recorded a count, so there is nothing to restore.
            return
        client = Client(project=self.beam_options['project_id'].get(), admin=True)
        instance = client.instance(self.beam_options['instance_id'].get())
        cluster = instance.cluster(self.beam_options['cluster_id'].get())
        # Compare against the live node count, not the unloaded default.
        cluster.reload()
        if cluster.serve_nodes != self.initial_node_count:
            cluster.serve_nodes = self.initial_node_count
            cluster.update()
我还使用 RuntimeValueProvider 参数来传递 bigtable 的各个 id(project_id、instance_id、cluster_id 等),所以我觉得无论用哪种转换来做扩缩容,都需要使用 DoFn。
如有任何帮助,我们将不胜感激!
如果您不是以模板方式运行 Dataflow 作业,而是以 VM 或 pod 中的 jar 方式运行,那么您可以在管道启动之前和结束之后,通过从 Java 中执行 bash 命令来完成此操作。参考这个 -
要执行的命令-
gcloud bigtable clusters update CLUSTER_ID --instance=INSTANCE_ID --num-nodes=NUM_NODES
但是,如果您以模板方式运行,那么模板文件只会包含管道开始和结束之间的内容,不会考虑其他任何内容。
所以我想出了一个 hacky 方法,但它有效。
在我的 WriteFn 的 setup() 期间,我读取 cluster.serve_nodes 计数(在第一个 worker 调用 setup() 之后,这个值显然会改变),如果它不是所需的节点数,就对集群进行扩容。在 process() 函数中,我 yield 这个计数。然后我执行 beam.CombineGlobally,用 Smallest(1) 找出这些计数中的最小值。然后我将其传递给另一个 DoFn,由它把集群缩减到该最小节点数。
这是我正在做的一些代码片段。
class _BigTableWriteFn(beam.DoFn):
    """Write each incoming row to Bigtable through a mutations batcher.

    ``setup()`` also records the cluster's serve-node count and scales the
    cluster to the requested size. ``process()`` emits that recorded count so
    a downstream step can pick the smallest one and scale back down.
    """

    def __init__(self, project_id, instance_id, table_id, cluster_id, node_count):
        """Store the connection options and initialise per-worker state.

        Every option is read later through its ``.get()`` method.

        Args:
            project_id: provider of the GCP project id to write rows to.
            instance_id: provider of the Bigtable instance id.
            table_id: provider of the Bigtable table id receiving the rows.
            cluster_id: provider of the id of the cluster to scale.
            node_count: provider of the node count to scale to before writing.
        """
        beam.DoFn.__init__(self)
        self.beam_options = {
            'project_id': project_id,
            'instance_id': instance_id,
            'table_id': table_id,
            'cluster_id': cluster_id,
            'node_count': node_count
        }
        self.table = None
        self.current_node_count = None
        self.batcher = None
        self.written = Metrics.counter(self.__class__, 'Written Row')

    def __getstate__(self):
        """Pickle only the options; the remaining attributes are rebuilt."""
        return self.beam_options

    def __setstate__(self, options):
        """Restore the options and reset every per-worker attribute."""
        self.beam_options = options
        self.table = None
        self.current_node_count = None
        self.batcher = None
        self.written = Metrics.counter(self.__class__, 'Written Row')

    def setup(self):
        """Record the live node count, then scale the cluster if it differs."""
        options = self.beam_options
        admin_client = Client(project=options['project_id'].get(), admin=True)
        bt_instance = admin_client.instance(options['instance_id'].get())
        bt_cluster = bt_instance.cluster(options['cluster_id'].get())
        bt_cluster.reload()
        target_nodes = options['node_count'].get()
        self.current_node_count = bt_cluster.serve_nodes
        if target_nodes == self.current_node_count:
            return
        bt_cluster.serve_nodes = target_nodes
        bt_cluster.update()

    def start_bundle(self):
        """Open the table on first use and create a batcher for this bundle."""
        if self.table is None:
            options = self.beam_options
            data_client = Client(project=options['project_id'].get())
            bt_instance = data_client.instance(options['instance_id'].get())
            self.table = bt_instance.table(options['table_id'].get())
        # finish_bundle() drops the batcher, so every bundle needs a new one.
        self.batcher = self.table.mutations_batcher()

    def process(self, row):
        """Queue ``row`` for writing and emit the node count seen in setup().

        Set the timestamp on the cells of the row object: a retry mutates the
        same object, and an explicit timestamp makes the cell take the new
        value. Example:

            direct_row.set_cell('cf1',
                                'field1',
                                'value1',
                                timestamp=datetime.datetime.now())
        """
        self.written.inc()
        self.batcher.mutate(row)
        # Emit the recorded node count so the minimum can be found later and
        # used to scale Bigtable back down.
        recorded_count = self.current_node_count
        if recorded_count:
            yield recorded_count

    def finish_bundle(self):
        """Flush the pending mutations and drop this bundle's batcher."""
        self.batcher.flush()
        self.batcher = None
class _BigTableScaleNodes(beam.DoFn):
    """Scale a Bigtable cluster to the node count received as input."""

    def __init__(self, project_id, instance_id, cluster_id):
        """Store the options that identify the cluster to scale.

        Every option is read later through its ``.get()`` method.

        Args:
            project_id: provider of the GCP project id.
            instance_id: provider of the Bigtable instance id.
            cluster_id: provider of the id of the cluster to scale.
        """
        beam.DoFn.__init__(self)
        self.beam_options = {
            'project_id': project_id,
            'instance_id': instance_id,
            'cluster_id': cluster_id,
        }
        self.cluster = None

    def setup(self):
        """Build the cluster handle once per DoFn instance."""
        if self.cluster is None:
            client = Client(project=self.beam_options['project_id'].get(), admin=True)
            instance = client.instance(self.beam_options['instance_id'].get())
            self.cluster = instance.cluster(self.beam_options['cluster_id'].get())

    def process(self, min_node_counts):
        """Scale the cluster to ``min_node_counts[0]`` if it is not there yet.

        Args:
            min_node_counts: sequence whose first element is the target node
                count; an empty sequence is ignored.
        """
        if not min_node_counts:
            return
        target_nodes = min_node_counts[0]
        # Refresh the handle so the comparison uses the live node count; the
        # handle built in setup() has never been loaded from the service.
        self.cluster.reload()
        if self.cluster.serve_nodes != target_nodes:
            self.cluster.serve_nodes = target_nodes
            self.cluster.update()
def run():
    """Build the BigQuery-to-Bigtable pipeline, run it and wait for it to finish.

    The write step emits node counts, the combine step keeps the smallest
    one, and the final step scales the cluster to that count.
    """
    custom_options = PipelineOptions().view_as(CustomOptions)
    pipeline_options = PipelineOptions()
    pipeline = beam.Pipeline(options=pipeline_options)

    query_results = pipeline | 'Query BigQuery' >> beam.io.Read(
        beam.io.BigQuerySource(query=QUERY, use_standard_sql=True))
    direct_rows = (
        query_results
        | 'Map Query Results to BigTable Rows' >> beam.Map(to_direct_rows))
    node_counts = direct_rows | 'Write BigTable Rows' >> beam.ParDo(
        _BigTableWriteFn(
            custom_options.bigtable_project_id,
            custom_options.bigtable_instance_id,
            custom_options.bigtable_table_id,
            custom_options.bigtable_cluster_id,
            custom_options.bigtable_node_count))
    smallest_count = (
        node_counts
        | 'Find Global Min Node Count' >> beam.CombineGlobally(
            beam.combiners.Smallest(1)))
    _ = smallest_count | 'Scale Down BigTable' >> beam.ParDo(
        _BigTableScaleNodes(
            custom_options.bigtable_project_id,
            custom_options.bigtable_instance_id,
            custom_options.bigtable_cluster_id))

    pipeline.run().wait_until_finish()