Python TfidfVectorizer:是否可以进行条件重新初始化?
Python TfidfVectorizer : Is conditional re-initialization possible?
我正在尝试有条件地重新初始化一个对象
假设我有以下初始化
TfidfVectorizer(sublinear_tf=True , decode_error='ignore', analyzer='word', tokenizer=nltk.data.load('tokenizers/punkt/english.pickle'))
现在,我从用户那里得到了一个字典,其中包含他想要添加的一些参数
d = {"stop_words":"english"}
如何将字典参数添加到已经初始化的对象中?所以对象的最终版本相当于
TfidfVectorizer(
stop_words='english',
sublinear_tf=True ,
decode_error='ignore',
analyzer='word',
tokenizer=nltk.data.load('tokenizers/punkt/english.pickle'))
我可以这样做吗:
TfidfVectorizer(**d)
这会保留之前初始化的参数吗?我想在 TfidfVectorizer 中有一些默认设置,然后我希望用户能够选择其余设置。
这样的事情可能吗?
似乎可以使用 set_params()
,从 set_params()
和 get_params()
的这个小实验来看:
from sklearn.feature_extraction.text import TfidfVectorizer
t = TfidfVectorizer()
t.get_params()
Out[23]:
{'analyzer': u'word',
'binary': False,
'charset': None,
'charset_error': None,
'decode_error': u'strict',
'dtype': numpy.int64,
'encoding': u'utf-8',
'input': u'content',
'lowercase': True,
'max_df': 1.0,
'max_features': None,
'min_df': 1,
'ngram_range': (1, 1),
'norm': u'l2',
'preprocessor': None,
'smooth_idf': True,
'stop_words': None,
'strip_accents': None,
'sublinear_tf': False,
'token_pattern': u'(?u)\b\w\w+\b',
'tokenizer': None,
'use_idf': True,
'vocabulary': None}
t.set_params(binary=True)
Out[24]:
TfidfVectorizer(analyzer=u'word', binary=True, charset=None,
charset_error=None, decode_error=u'strict',
dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
stop_words=None, strip_accents=None, sublinear_tf=False,
token_pattern=u'(?u)\b\w\w+\b', tokenizer=None, use_idf=True,
vocabulary=None)
t.set_params(smooth_idf=False)
Out[25]:
TfidfVectorizer(analyzer=u'word', binary=True, charset=None,
charset_error=None, decode_error=u'strict',
dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), norm=u'l2', preprocessor=None,
smooth_idf=False, stop_words=None, strip_accents=None,
sublinear_tf=False, token_pattern=u'(?u)\b\w\w+\b',
tokenizer=None, use_idf=True, vocabulary=None)
d = {"stop_words":"english"}
t.set_params(**d)
Out[27]:
TfidfVectorizer(analyzer=u'word', binary=True, charset=None,
charset_error=None, decode_error=u'strict',
dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), norm=u'l2', preprocessor=None,
smooth_idf=False, stop_words='english', strip_accents=None,
sublinear_tf=False, token_pattern=u'(?u)\b\w\w+\b',
tokenizer=None, use_idf=True, vocabulary=None)
此外,源代码显示 .set_params()
只遍历您提供给它的那些参数,其余参数不受影响:
def set_params(self, **params):
    """Set the parameters of this estimator.

    The method works on simple estimators as well as on nested objects
    (such as pipelines). The former have parameters of the form
    ``<component>__<parameter>`` so that it's possible to update each
    component of a nested object.

    Parameters
    ----------
    **params : dict
        Parameter names mapped to their new values. A key of the form
        ``name__subname`` updates parameter ``subname`` of the nested
        estimator stored under ``name``.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If a key (or the ``name`` part of a nested key) is not a valid
        parameter of this estimator.
    """
    if not params:
        # Simple optimisation to gain speed (inspect is slow)
        return self
    valid_params = self.get_params(deep=True)
    # NOTE: six.iteritems was a Python-2 compatibility shim; on
    # Python 3 iterating dict.items() is the idiomatic equivalent.
    for key, value in params.items():
        split = key.split('__', 1)
        if len(split) > 1:
            # nested objects case: delegate to the sub-estimator
            name, sub_name = split
            if name not in valid_params:
                # Format the class name, consistent with the simple case
                raise ValueError('Invalid parameter %s for estimator %s. '
                                 'Check the list of available parameters '
                                 'with `estimator.get_params().keys()`.' %
                                 (name, self.__class__.__name__))
            sub_object = valid_params[name]
            sub_object.set_params(**{sub_name: value})
        else:
            # simple objects case: set the attribute directly
            if key not in valid_params:
                raise ValueError('Invalid parameter %s for estimator %s. '
                                 'Check the list of available parameters '
                                 'with `estimator.get_params().keys()`.' %
                                 (key, self.__class__.__name__))
            setattr(self, key, value)
    return self
我正在尝试有条件地重新初始化一个对象
假设我有以下初始化
TfidfVectorizer(sublinear_tf=True , decode_error='ignore', analyzer='word', tokenizer=nltk.data.load('tokenizers/punkt/english.pickle'))
现在,我从用户那里得到了一个字典,其中包含他想要添加的一些参数
d = {"stop_words":"english"}
如何将字典参数添加到已经初始化的对象中?所以对象的最终版本相当于
TfidfVectorizer(
stop_words='english',
sublinear_tf=True ,
decode_error='ignore',
analyzer='word',
tokenizer=nltk.data.load('tokenizers/punkt/english.pickle'))
我可以这样做吗:
TfidfVectorizer(**d)
这会保留之前初始化的参数吗?我想在 TfidfVectorizer 中有一些默认设置,然后我希望用户能够选择其余设置。
这样的事情可能吗?
似乎可以使用 set_params()
,从 set_params()
和 get_params()
的这个小实验来看:
from sklearn.feature_extraction.text import TfidfVectorizer
t = TfidfVectorizer()
t.get_params()
Out[23]:
{'analyzer': u'word',
'binary': False,
'charset': None,
'charset_error': None,
'decode_error': u'strict',
'dtype': numpy.int64,
'encoding': u'utf-8',
'input': u'content',
'lowercase': True,
'max_df': 1.0,
'max_features': None,
'min_df': 1,
'ngram_range': (1, 1),
'norm': u'l2',
'preprocessor': None,
'smooth_idf': True,
'stop_words': None,
'strip_accents': None,
'sublinear_tf': False,
'token_pattern': u'(?u)\b\w\w+\b',
'tokenizer': None,
'use_idf': True,
'vocabulary': None}
t.set_params(binary=True)
Out[24]:
TfidfVectorizer(analyzer=u'word', binary=True, charset=None,
charset_error=None, decode_error=u'strict',
dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
stop_words=None, strip_accents=None, sublinear_tf=False,
token_pattern=u'(?u)\b\w\w+\b', tokenizer=None, use_idf=True,
vocabulary=None)
t.set_params(smooth_idf=False)
Out[25]:
TfidfVectorizer(analyzer=u'word', binary=True, charset=None,
charset_error=None, decode_error=u'strict',
dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), norm=u'l2', preprocessor=None,
smooth_idf=False, stop_words=None, strip_accents=None,
sublinear_tf=False, token_pattern=u'(?u)\b\w\w+\b',
tokenizer=None, use_idf=True, vocabulary=None)
d = {"stop_words":"english"}
t.set_params(**d)
Out[27]:
TfidfVectorizer(analyzer=u'word', binary=True, charset=None,
charset_error=None, decode_error=u'strict',
dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), norm=u'l2', preprocessor=None,
smooth_idf=False, stop_words='english', strip_accents=None,
sublinear_tf=False, token_pattern=u'(?u)\b\w\w+\b',
tokenizer=None, use_idf=True, vocabulary=None)
此外,源代码显示 .set_params()
只遍历您提供给它的那些参数,其余参数不受影响:
def set_params(self, **params):
    """Set the parameters of this estimator.

    The method works on simple estimators as well as on nested objects
    (such as pipelines). The former have parameters of the form
    ``<component>__<parameter>`` so that it's possible to update each
    component of a nested object.

    Parameters
    ----------
    **params : dict
        Parameter names mapped to their new values. A key of the form
        ``name__subname`` updates parameter ``subname`` of the nested
        estimator stored under ``name``.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If a key (or the ``name`` part of a nested key) is not a valid
        parameter of this estimator.
    """
    if not params:
        # Simple optimisation to gain speed (inspect is slow)
        return self
    valid_params = self.get_params(deep=True)
    # NOTE: six.iteritems was a Python-2 compatibility shim; on
    # Python 3 iterating dict.items() is the idiomatic equivalent.
    for key, value in params.items():
        split = key.split('__', 1)
        if len(split) > 1:
            # nested objects case: delegate to the sub-estimator
            name, sub_name = split
            if name not in valid_params:
                # Format the class name, consistent with the simple case
                raise ValueError('Invalid parameter %s for estimator %s. '
                                 'Check the list of available parameters '
                                 'with `estimator.get_params().keys()`.' %
                                 (name, self.__class__.__name__))
            sub_object = valid_params[name]
            sub_object.set_params(**{sub_name: value})
        else:
            # simple objects case: set the attribute directly
            if key not in valid_params:
                raise ValueError('Invalid parameter %s for estimator %s. '
                                 'Check the list of available parameters '
                                 'with `estimator.get_params().keys()`.' %
                                 (key, self.__class__.__name__))
            setattr(self, key, value)
    return self