如何在 Hypothesis 数据框中实现依赖列
How to implement dependent columns in Hypothesis dataframes
我正在使用 Hypothesis 的 data_frames 策略来生成一个数据框,其中 start_time 和 end_time 是两列。代码大致如下:
import hypothesis.strategies as st
import logging
import datetime
from hypothesis import given
from hypothesis.extra.pandas import column, data_frames, range_indexes
# Local midnight of the current day, as a naive datetime.
current_time = datetime.datetime.combine(datetime.date.today(), datetime.time.min)

# Epoch-second bounds of the sampling window: 4 and 20 hours after midnight.
_window_opens = (current_time + datetime.timedelta(hours=4)).timestamp()
_window_closes = (current_time + datetime.timedelta(hours=20)).timestamp()
datetime_st = st.integers(min_value=_window_opens, max_value=_window_closes)

# Keyword arguments passed to ``column``, keyed by column name. Both columns
# share the same element strategy, so nothing relates the two values of a row.
df_columns = dict(
    # other fields omitted
    start_time=dict(elements=datetime_st, unique=False),
    end_time=dict(elements=datetime_st, unique=False),
)

_column_specs = [column(name, **options) for name, options in df_columns.items()]
test_dfs = data_frames(
    columns=_column_specs,
    index=range_indexes(min_size=20, max_size=100),
)
@given(df=test_dfs)
def test_hyothesis(df):
    """Log each generated data frame; the assertion is only a placeholder."""
    logging.info(df)
    assert True
我找不到一种方法来保证每个 start_time 都比其对应的 end_time 至少大 delta。我尝试过 composite,但不确定如何将其应用到 dataframes 的每一行上。
有没有办法在初始化 start_time 和 end_time 时将该增量作为规则强制执行?
下面是一种生成包含两个时间戳列的数据框的方法,其中第一列与第二列之间的差值至少为 3600 秒(或其他时间量)。为此我使用了策略对象的 flatmap 方法。
import hypothesis.strategies as st
from hypothesis.extra.pandas import column, data_frames, range_indexes, columns
# Local midnight of the current day as whole epoch seconds. ``timestamp()``
# returns a float, so it is converted to ``int`` to give ``st.integers``
# genuine integer bounds.
current_time = int(
    datetime.datetime.now()
    .replace(hour=0, minute=0, second=0, microsecond=0)
    .timestamp()
)
SECONDS_PER_HOUR = 3600
# Minimum gap, in seconds, between start_time and end_time of the same row.
# Must not exceed the 10-hour maximum gap used below, otherwise the end_time
# bounds would be inverted.
MIN_DIFF_SECONDS = 3600

# Strategy producing (start_time, end_time) tuples. The start is drawn first,
# between 4 and 20 hours after midnight; ``flatmap`` then builds the end_time
# strategy from the drawn start, so the end is always at least
# MIN_DIFF_SECONDS (and at most 10 hours) later.
two_timestamps_with_diff = st.integers(
    min_value=current_time + 4 * SECONDS_PER_HOUR,
    max_value=current_time + 20 * SECONDS_PER_HOUR,
).flatmap(
    lambda n: st.tuples(
        st.just(n),
        st.integers(
            min_value=n + MIN_DIFF_SECONDS,
            max_value=n + 10 * SECONDS_PER_HOUR,
        ),
    )
)

# sample code to examine the results of this strategy
# for _ in range(10):
#     x, y = two_timestamps_with_diff.example()
#     print(x, y, y - x)

# Each row of the frame is one (start_time, end_time) tuple from the strategy
# above, so the constraint holds row by row.
test_dfs = data_frames(
    index=range_indexes(min_size=20, max_size=100),
    columns=columns(["start_time", "end_time"], dtype=int),
    rows=two_timestamps_with_diff,
)

# sample code to examine the results of this strategy
# res = test_dfs.example()
# res.assign(d=res.end_time - res.start_time)
# Property test validating the minimum-gap constraint on every generated frame.
@given(df=test_dfs)
def test_hyothesis(df):
    """Check that end_time exceeds start_time by at least MIN_DIFF_SECONDS in every row."""
    logging.info(df)
    gaps = df["end_time"] - df["start_time"]
    assert (gaps >= MIN_DIFF_SECONDS).all()


# Run the test directly; the author of this snippet reports that it passes.
test_hyothesis()
如果您想向自动生成的数据框中添加额外的列,请执行以下操作(在此示例中,新列为 'A' 和 'B'):
from hypothesis.strategies import composite
@composite
def test_df_with_additional_columns(draw, elements=test_dfs):
    """Draw a base data frame and append two generated columns, ``A`` and ``B``.

    Args:
        draw: Supplied by ``@composite``; draws a value from a strategy.
        elements: Strategy producing the base data frame. Defaults to
            ``test_dfs``.

    Returns:
        The base frame concatenated column-wise with an int column ``A`` and
        a float column ``B`` that were generated on the base frame's index.
    """
    # Local import: pandas is not imported at module level in this snippet.
    import pandas as pd

    df = draw(elements)
    # Pin the index of the extra columns to the base frame's index so both
    # frames have exactly the same rows when concatenated.
    more_col_strategy = data_frames(
        [column("A", dtype=int), column("B", dtype=float)],
        index=st.just(df.index),
    )
    more_cols = draw(more_col_strategy)
    return pd.concat([df, more_cols], axis=1)


# Draw one sample frame to inspect the result.
test_df_with_additional_columns().example()
我正在使用 Hypothesis 的 data_frames 策略来生成一个数据框,其中 start_time 和 end_time 是两列。代码大致如下:
import hypothesis.strategies as st
import logging
import datetime
from hypothesis import given
from hypothesis.extra.pandas import column, data_frames, range_indexes
# Local midnight of the current day, as a naive datetime.
current_time = datetime.datetime.combine(datetime.date.today(), datetime.time.min)

# Epoch-second bounds of the sampling window: 4 and 20 hours after midnight.
_window_opens = (current_time + datetime.timedelta(hours=4)).timestamp()
_window_closes = (current_time + datetime.timedelta(hours=20)).timestamp()
datetime_st = st.integers(min_value=_window_opens, max_value=_window_closes)

# Keyword arguments passed to ``column``, keyed by column name. Both columns
# share the same element strategy, so nothing relates the two values of a row.
df_columns = dict(
    # other fields omitted
    start_time=dict(elements=datetime_st, unique=False),
    end_time=dict(elements=datetime_st, unique=False),
)

_column_specs = [column(name, **options) for name, options in df_columns.items()]
test_dfs = data_frames(
    columns=_column_specs,
    index=range_indexes(min_size=20, max_size=100),
)
@given(df=test_dfs)
def test_hyothesis(df):
    """Log each generated data frame; the assertion is only a placeholder."""
    logging.info(df)
    assert True
我找不到一种方法来保证每个 start_time 都比其对应的 end_time 至少大 delta。我尝试过 composite,但不确定如何将其应用到 dataframes 的每一行上。
有没有办法在初始化 start_time 和 end_time 时将该增量作为规则强制执行?
下面是一种生成包含两个时间戳列的数据框的方法,其中第一列与第二列之间的差值至少为 3600 秒(或其他时间量)。为此我使用了策略对象的 flatmap 方法。
import hypothesis.strategies as st
from hypothesis.extra.pandas import column, data_frames, range_indexes, columns
# Local midnight of the current day as whole epoch seconds. ``timestamp()``
# returns a float, so it is converted to ``int`` to give ``st.integers``
# genuine integer bounds.
current_time = int(
    datetime.datetime.now()
    .replace(hour=0, minute=0, second=0, microsecond=0)
    .timestamp()
)
SECONDS_PER_HOUR = 3600
# Minimum gap, in seconds, between start_time and end_time of the same row.
# Must not exceed the 10-hour maximum gap used below, otherwise the end_time
# bounds would be inverted.
MIN_DIFF_SECONDS = 3600

# Strategy producing (start_time, end_time) tuples. The start is drawn first,
# between 4 and 20 hours after midnight; ``flatmap`` then builds the end_time
# strategy from the drawn start, so the end is always at least
# MIN_DIFF_SECONDS (and at most 10 hours) later.
two_timestamps_with_diff = st.integers(
    min_value=current_time + 4 * SECONDS_PER_HOUR,
    max_value=current_time + 20 * SECONDS_PER_HOUR,
).flatmap(
    lambda n: st.tuples(
        st.just(n),
        st.integers(
            min_value=n + MIN_DIFF_SECONDS,
            max_value=n + 10 * SECONDS_PER_HOUR,
        ),
    )
)

# sample code to examine the results of this strategy
# for _ in range(10):
#     x, y = two_timestamps_with_diff.example()
#     print(x, y, y - x)

# Each row of the frame is one (start_time, end_time) tuple from the strategy
# above, so the constraint holds row by row.
test_dfs = data_frames(
    index=range_indexes(min_size=20, max_size=100),
    columns=columns(["start_time", "end_time"], dtype=int),
    rows=two_timestamps_with_diff,
)

# sample code to examine the results of this strategy
# res = test_dfs.example()
# res.assign(d=res.end_time - res.start_time)
# Property test validating the minimum-gap constraint on every generated frame.
@given(df=test_dfs)
def test_hyothesis(df):
    """Check that end_time exceeds start_time by at least MIN_DIFF_SECONDS in every row."""
    logging.info(df)
    gaps = df["end_time"] - df["start_time"]
    assert (gaps >= MIN_DIFF_SECONDS).all()


# Run the test directly; the author of this snippet reports that it passes.
test_hyothesis()
如果您想向自动生成的数据框中添加额外的列,请执行以下操作(在此示例中,新列为 'A' 和 'B'):
from hypothesis.strategies import composite
@composite
def test_df_with_additional_columns(draw, elements=test_dfs):
    """Draw a base data frame and append two generated columns, ``A`` and ``B``.

    Args:
        draw: Supplied by ``@composite``; draws a value from a strategy.
        elements: Strategy producing the base data frame. Defaults to
            ``test_dfs``.

    Returns:
        The base frame concatenated column-wise with an int column ``A`` and
        a float column ``B`` that were generated on the base frame's index.
    """
    # Local import: pandas is not imported at module level in this snippet.
    import pandas as pd

    df = draw(elements)
    # Pin the index of the extra columns to the base frame's index so both
    # frames have exactly the same rows when concatenated.
    more_col_strategy = data_frames(
        [column("A", dtype=int), column("B", dtype=float)],
        index=st.just(df.index),
    )
    more_cols = draw(more_col_strategy)
    return pd.concat([df, more_cols], axis=1)


# Draw one sample frame to inspect the result.
test_df_with_additional_columns().example()