根据年月随机抽样行
Randomly sample rows based on year-month
# Example frame from the question: 'date' holds the raw date strings and
# 'tweets' the text attached to each row.
data = {
    'date': ['2019-01-01', '2019-01-02', '2020-01-01', '2020-02-02'],
    'tweets': ["aaa", "bbb", "ccc", "ddd"],
}
df = pandas.DataFrame(data)
# Parse the strings into a datetime column.  The infer_datetime_format flag
# is deprecated (pandas >= 2.0 infers the format by default), so it is omitted.
df['daate'] = pandas.to_datetime(df['date'])
所以我有一个 object 类型的日期列和一个 datetime64[ns] 类型的日期列。假设每个年月都有 100 行数据。如何在每个年月中随机抽取 10 行并将它们放入一个数据框中?谢谢!
使用 DataFrame.groupby 按年和月(或按月周期)分组,并配合自定义 lambda 函数调用 DataFrame.sample:
# Group the rows by the (year, month) of 'daate' and draw 10 random rows
# from every group; group_keys=False leaves the group labels out of the
# resulting index.
daate_parts = df['daate'].dt
grouped_by_year_month = df.groupby(
    [daate_parts.year, daate_parts.month], group_keys=False
)
df1 = grouped_by_year_month.apply(lambda month_rows: month_rows.sample(n=10))
或:
# Same sampling, keyed on one monthly Period per row instead of a
# (year, month) pair.
month_period = df['daate'].dt.to_period('m')
df1 = df.groupby(month_period, group_keys=False).apply(
    lambda month_rows: month_rows.sample(n=10)
)
样本:
# Demo data: one row per calendar day from 2019-01-01 through 2020-01-22
# (387 days), each with a randomly chosen tweet label.
data = dict(
    daate=pd.date_range('2019-01-01', '2020-01-22'),
    tweets=np.random.choice(["aaa", "bbb", "ccc", "ddd"], 387),
)
df = pd.DataFrame(data)
# Draw 10 random rows from every (year, month) group and show the result.
year_key = df['daate'].dt.year
month_key = df['daate'].dt.month
df1 = df.groupby([year_key, month_key], group_keys=False).apply(
    lambda month_rows: month_rows.sample(n=10)
)
print(df1)
date tweets daate
9 2019-01-10 bbb 2019-01-10
29 2019-01-30 ddd 2019-01-30
17 2019-01-18 ccc 2019-01-18
12 2019-01-13 ccc 2019-01-13
20 2019-01-21 ddd 2019-01-21
.. ... ... ...
381 2020-01-17 bbb 2020-01-17
375 2020-01-11 aaa 2020-01-11
373 2020-01-09 bbb 2020-01-09
368 2020-01-04 aaa 2020-01-04
382 2020-01-18 bbb 2020-01-18
[130 rows x 3 columns]
import pandas as pd
# Example data: raw date strings plus the tweet text for each row.
data = {
    "date": ["2019-01-01", "2019-01-02", "2020-01-01", "2020-02-02"],
    "tweets": ["aaa", "bbb", "ccc", "ddd"],
}
df = pd.DataFrame(data)
# Parse the date strings into a datetime column.  The deprecated
# infer_datetime_format flag is omitted; pandas >= 2.0 infers the format
# by default.
df["daate"] = pd.to_datetime(df["date"])
# Just duplicating each row 100 times so every date has rows to sample from.
df = df.loc[df.index.repeat(100)]
# The actual code: draw 5 random rows for every distinct date.  The pieces
# are collected in a list and combined with a single pd.concat call, because
# DataFrame.append was removed in pandas 2.0 (and re-copied the accumulated
# frame on every iteration).
available_dates = df["daate"].unique()
sampled_parts = []
for each_date in available_dates:
    rows_with_that_date = df.loc[df["daate"] == each_date]
    sampled_parts.append(rows_with_that_date.sample(5))  # 5 samples
sampled_df = pd.concat(sampled_parts)
print(len(sampled_df))
# Example frame from the question: 'date' holds the raw date strings and
# 'tweets' the text attached to each row.
data = {
    'date': ['2019-01-01', '2019-01-02', '2020-01-01', '2020-02-02'],
    'tweets': ["aaa", "bbb", "ccc", "ddd"],
}
df = pandas.DataFrame(data)
# Parse the strings into a datetime column.  The infer_datetime_format flag
# is deprecated (pandas >= 2.0 infers the format by default), so it is omitted.
df['daate'] = pandas.to_datetime(df['date'])
所以我有一个 object 类型的日期列和一个 datetime64[ns] 类型的日期列。假设每个年月都有 100 行数据。如何在每个年月中随机抽取 10 行并将它们放入一个数据框中?谢谢!
使用 DataFrame.groupby 按年和月(或按月周期)分组,并配合自定义 lambda 函数调用 DataFrame.sample:
# Group the rows by the (year, month) of 'daate' and draw 10 random rows
# from every group; group_keys=False leaves the group labels out of the
# resulting index.
daate_parts = df['daate'].dt
grouped_by_year_month = df.groupby(
    [daate_parts.year, daate_parts.month], group_keys=False
)
df1 = grouped_by_year_month.apply(lambda month_rows: month_rows.sample(n=10))
或:
# Same sampling, keyed on one monthly Period per row instead of a
# (year, month) pair.
month_period = df['daate'].dt.to_period('m')
df1 = df.groupby(month_period, group_keys=False).apply(
    lambda month_rows: month_rows.sample(n=10)
)
样本:
# Demo data: one row per calendar day from 2019-01-01 through 2020-01-22
# (387 days), each with a randomly chosen tweet label.
data = dict(
    daate=pd.date_range('2019-01-01', '2020-01-22'),
    tweets=np.random.choice(["aaa", "bbb", "ccc", "ddd"], 387),
)
df = pd.DataFrame(data)
# Draw 10 random rows from every (year, month) group and show the result.
year_key = df['daate'].dt.year
month_key = df['daate'].dt.month
df1 = df.groupby([year_key, month_key], group_keys=False).apply(
    lambda month_rows: month_rows.sample(n=10)
)
print(df1)
date tweets daate
9 2019-01-10 bbb 2019-01-10
29 2019-01-30 ddd 2019-01-30
17 2019-01-18 ccc 2019-01-18
12 2019-01-13 ccc 2019-01-13
20 2019-01-21 ddd 2019-01-21
.. ... ... ...
381 2020-01-17 bbb 2020-01-17
375 2020-01-11 aaa 2020-01-11
373 2020-01-09 bbb 2020-01-09
368 2020-01-04 aaa 2020-01-04
382 2020-01-18 bbb 2020-01-18
[130 rows x 3 columns]
import pandas as pd
# Example data: raw date strings plus the tweet text for each row.
data = {
    "date": ["2019-01-01", "2019-01-02", "2020-01-01", "2020-02-02"],
    "tweets": ["aaa", "bbb", "ccc", "ddd"],
}
df = pd.DataFrame(data)
# Parse the date strings into a datetime column.  The deprecated
# infer_datetime_format flag is omitted; pandas >= 2.0 infers the format
# by default.
df["daate"] = pd.to_datetime(df["date"])
# Just duplicating each row 100 times so every date has rows to sample from.
df = df.loc[df.index.repeat(100)]
# The actual code: draw 5 random rows for every distinct date.  The pieces
# are collected in a list and combined with a single pd.concat call, because
# DataFrame.append was removed in pandas 2.0 (and re-copied the accumulated
# frame on every iteration).
available_dates = df["daate"].unique()
sampled_parts = []
for each_date in available_dates:
    rows_with_that_date = df.loc[df["daate"] == each_date]
    sampled_parts.append(rows_with_that_date.sample(5))  # 5 samples
sampled_df = pd.concat(sampled_parts)
print(len(sampled_df))