并行化 for 循环并合并 pandas 数据帧
parallelize for loop and merge pandas dataframes
我的脚本如下
import pandas as pd
# Base frame: four rows identified by 'key', with one label column 'A'.
df = pd.DataFrame(
    data={
        'key': ['K0', 'K1', 'K2', 'K3'],
        'A': ['A0', 'A1', 'A2', 'A3'],
    },
)
def make_df(year: int) -> pd.DataFrame:
    """Build the per-year frame that gets merged onto the base frame.

    Args:
        year: Integer year. Its string form names the value column and is
            the first of the four consecutive year values in that column.

    Returns:
        A 4-row DataFrame with the columns 'key' and ``str(year)``.
    """
    # Use a distinct local name so it is not confused with the module-level
    # `df` that accumulates the merged result.
    year_df = pd.DataFrame({
        'key': ['K0', 'K1', 'K2', 'K3'],
        str(year): [str(year + offset) for offset in range(4)],
    })
    return year_df
# Merge one year's frame at a time, newest first (2020 down to 2016; the stop
# value 2015 is exclusive), so the year columns end up in that order.
for year in range(2020, 2015, -1):
    # Left merge on 'key' keeps every row of the accumulated frame.
    df = pd.merge(df, make_df(year), on=['key'], how='left')
最终的 df 将是:
key A 2020 2019 2018 2017 2016
0 K0 A0 2020 2019 2018 2017 2016
1 K1 A1 2021 2020 2019 2018 2017
2 K2 A2 2022 2021 2020 2019 2018
3 K3 A3 2023 2022 2021 2020 2019
我的实际 make_new_df(year)
复杂得多,需要太多时间。
如何并行化 for 循环 for year in range(2020, 2015, -1):
并缩短处理时间?
编辑:使用 multiprocessing
而不是 threading
阅读您的评论后,您似乎想在不同的进程中(并行)运行您的函数:
import multiprocessing
import pandas as pd
# Base frame: four rows identified by 'key', with one label column 'A'.
df = pd.DataFrame(
    data={
        'key': ['K0', 'K1', 'K2', 'K3'],
        'A': ['A0', 'A1', 'A2', 'A3'],
    },
)

# Years are walked newest-first: 2020, 2019, ..., 2016 (the stop is exclusive).
year_start, year_stop = 2020, 2015
year_range = range(year_start, year_stop, -1)
def make_df(year: int) -> pd.DataFrame:
    """Build the single-column frame for one year.

    This variant has no 'key' column; the frame carries only the default
    integer index.

    Args:
        year: Integer year. Its string form names the column and is the
            first of the four consecutive year values in that column.

    Returns:
        A 4-row DataFrame whose only column is ``str(year)``.
    """
    # Use a distinct local name so it is not confused with the module-level
    # `df` base frame.
    year_df = pd.DataFrame({
        str(year): [str(year + offset) for offset in range(4)],
    })
    return year_df
# The pool must only be created when this file is run as a script. With the
# 'spawn' start method (Windows, macOS) every worker re-imports this module,
# and an unguarded Pool(...) at top level would be executed again in each one.
if __name__ == "__main__":
    # One worker process per year in the range.
    with multiprocessing.Pool(year_start - year_stop) as pool:
        # map() blocks until every result is back and returns them in the
        # same order as year_range, so the pool can be torn down on exit.
        df_list = pool.map(func=make_df, iterable=year_range)

    # Joining a list of frames aligns them on the index, not on 'key'.
    df = df.join(df_list)
    print(df)
我的脚本如下
import pandas as pd
# Base frame: four rows identified by 'key', with one label column 'A'.
df = pd.DataFrame(
    data={
        'key': ['K0', 'K1', 'K2', 'K3'],
        'A': ['A0', 'A1', 'A2', 'A3'],
    },
)
def make_df(year: int) -> pd.DataFrame:
    """Build the per-year frame that gets merged onto the base frame.

    Args:
        year: Integer year. Its string form names the value column and is
            the first of the four consecutive year values in that column.

    Returns:
        A 4-row DataFrame with the columns 'key' and ``str(year)``.
    """
    # Use a distinct local name so it is not confused with the module-level
    # `df` that accumulates the merged result.
    year_df = pd.DataFrame({
        'key': ['K0', 'K1', 'K2', 'K3'],
        str(year): [str(year + offset) for offset in range(4)],
    })
    return year_df
# Merge one year's frame at a time, newest first (2020 down to 2016; the stop
# value 2015 is exclusive), so the year columns end up in that order.
for year in range(2020, 2015, -1):
    # Left merge on 'key' keeps every row of the accumulated frame.
    df = pd.merge(df, make_df(year), on=['key'], how='left')
最终的 df 将是:
key A 2020 2019 2018 2017 2016
0 K0 A0 2020 2019 2018 2017 2016
1 K1 A1 2021 2020 2019 2018 2017
2 K2 A2 2022 2021 2020 2019 2018
3 K3 A3 2023 2022 2021 2020 2019
我的实际 make_new_df(year)
复杂得多,需要太多时间。
如何并行化 for 循环 for year in range(2020, 2015, -1):
并缩短处理时间?
编辑:使用 multiprocessing
而不是 threading
阅读您的评论后,您似乎想在不同的进程中(并行)运行您的函数:
import multiprocessing
import pandas as pd
# Base frame: four rows identified by 'key', with one label column 'A'.
df = pd.DataFrame(
    data={
        'key': ['K0', 'K1', 'K2', 'K3'],
        'A': ['A0', 'A1', 'A2', 'A3'],
    },
)

# Years are walked newest-first: 2020, 2019, ..., 2016 (the stop is exclusive).
year_start, year_stop = 2020, 2015
year_range = range(year_start, year_stop, -1)
def make_df(year: int) -> pd.DataFrame:
    """Build the single-column frame for one year.

    This variant has no 'key' column; the frame carries only the default
    integer index.

    Args:
        year: Integer year. Its string form names the column and is the
            first of the four consecutive year values in that column.

    Returns:
        A 4-row DataFrame whose only column is ``str(year)``.
    """
    # Use a distinct local name so it is not confused with the module-level
    # `df` base frame.
    year_df = pd.DataFrame({
        str(year): [str(year + offset) for offset in range(4)],
    })
    return year_df
# The pool must only be created when this file is run as a script. With the
# 'spawn' start method (Windows, macOS) every worker re-imports this module,
# and an unguarded Pool(...) at top level would be executed again in each one.
if __name__ == "__main__":
    # One worker process per year in the range.
    with multiprocessing.Pool(year_start - year_stop) as pool:
        # map() blocks until every result is back and returns them in the
        # same order as year_range, so the pool can be torn down on exit.
        df_list = pool.map(func=make_df, iterable=year_range)

    # Joining a list of frames aligns them on the index, not on 'key'.
    df = df.join(df_list)
    print(df)