重构 python 函数的最佳方法
Best approach to refactoring a python function
我有一个杂乱的函数,我正在努力重构以使其更高效和可读。我的 python 技能充其量只是初级到中级水平,我想有一种更简洁的方法可以完成这项任务。
下面的函数接受一个字符串,其中包含各种与业务联系相关的信息。信息以冒号分隔。公司名称始终是第一个字段,因此可以轻松提取,但其余“列”(冒号之间的数据)可能存在也可能不存在,并且顺序并不总是相同。
该函数有两个参数,1) 行数据(包含以下示例的字符串)和 2) 我希望返回的数据元素。
# Business Contact Information
def parseBusinessContactInformation(self,rowdata,element):
    ## Process Business Contact Information
    ## rowdata : colon-separated contact string, business name first
    ## element : which value to return - businessName, businessDba,
    ##           businessPhone, businessEmail or businessWebsite
    ## returns : the requested value; None for a phone/email/website that was
    ##           not found; self.dataNotAvailableMessage for an unknown element
    ## example rowdata = "Business Name, LLC : Business DBA : Email- person@email.com : Phone- 1234567890 : Website- www.site.com"
    ## example rowdata = "Business Name, LLC : Email- person@email.com : Phone- 1234567890 : Website- www.site.com"
    ## example rowdata = "Business Name, LLC : Business DBA : Phone- 1234567890 : Website- www.site.com"
    ## example rowdata = "Business Name, LLC : Phone- 1234567890"

    businessName = None
    businessDba = None
    businessPhone = None
    businessEmail = None
    businessWebsite = None

    # Split rowdata on :
    contactData = rowdata.split(':')

    ## [0] - business name should always be present
    businessName = contactData[0].strip()

    ## [1] - doing_business_as or another field if not present
    ## (when [1] is a phone/email/website field, the DBA falls back to the name)
    if 1 < len(contactData) and re.search('email',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessEmail = contactTemp[1].strip()
        businessDba = contactData[0].strip()
    elif 1 < len(contactData) and re.search('phone',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessPhone = contactTemp[1].strip()
        businessDba = contactData[0].strip()
    elif 1 < len(contactData) and re.search('website',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessWebsite = contactTemp[1].strip()
        businessDba = contactData[0].strip()
    elif 1 < len(contactData) and not re.search(r'(phone|email|website)',contactData[1].lower()):
        businessDba = contactData[1].strip()
    else:
        # reached when rowdata has no second field at all
        businessDba = self.dataNotAvailableMessage

    ## [2] - phone or email or website
    if 2 < len(contactData) and re.search('email',contactData[2].lower()):
        contactTemp = contactData[2].split('-')
        businessEmail = contactTemp[1].strip()
    elif 2 < len(contactData) and re.search('phone',contactData[2].lower()):
        contactTemp = contactData[2].split('-')
        businessPhone = contactTemp[1].strip()
    elif 2 < len(contactData) and re.search('website',contactData[2].lower()):
        contactTemp = contactData[2].split('-')
        businessWebsite = contactTemp[1].strip()

    ## [3] - phone or email or website
    ## NOTE: only fields [1]-[3] are inspected, so a fifth field (such as
    ## Website in the first example above) is ignored.
    if 3 < len(contactData) and re.search('email',contactData[3].lower()):
        contactTemp = contactData[3].split('-')
        businessEmail = contactTemp[1].strip()
    elif 3 < len(contactData) and re.search('phone',contactData[3].lower()):
        contactTemp = contactData[3].split('-')
        businessPhone = contactTemp[1].strip()
    elif 3 < len(contactData) and re.search('website',contactData[3].lower()):
        contactTemp = contactData[3].split('-')
        businessWebsite = contactTemp[1].strip()

    if element == "businessName":
        return businessName
    elif element == "businessDba":
        return businessDba
    elif element == "businessPhone":
        return businessPhone
    elif element == "businessEmail":
        return businessEmail
    elif element == "businessWebsite":
        return businessWebsite
    else:
        return self.dataNotAvailableMessage
我正在尝试了解更好的方法。
重构是一个累积的过程。 Martin Fowler 和 Kent Beck 在 Refactoring 中对该方法进行了全面的描述。
Its heart is a series of small behavior preserving transformations. (Martin Fowler, https://refactoring.com/)
最重要的部分是:“小”和“行为保持”。 “小”这个词是不言自明的,但是“行为保持”应该通过单元测试来保证。
初步评论:我建议您坚持使用 PEP 8 Style Guide。
行为保留
用文档字符串(docstring)替换您的注释 (https://www.python.org/dev/peps/pep-0008/#id33)。这非常有用,因为您可以在文档字符串中编写一些单元测试(即 doctest)。
class MyParser:
    # Value returned when the requested element name is not recognised.
    dataNotAvailableMessage = "dataNotAvailableMessage"

    # Business Contact Information
    def parseBusinessContactInformation(self,rowdata,element):
        """Process Business Contact Information

        Examples:

        >>> p = MyParser()
        >>> p.parseBusinessContactInformation("Business Name, LLC : Business DBA : Email- person@email.com : Phone- 1234567890 : Website- www.site.com", "businessPhone")
        '1234567890'
        >>> p.parseBusinessContactInformation("Business Name, LLC : Email- person@email.com : Phone- 1234567890 : Website- www.site.com", "businessName")
        'Business Name, LLC'
        >>> p.parseBusinessContactInformation("Business Name, LLC : Business DBA : Phone- 1234567890 : Website- www.site.com", "businessDba")
        'Business DBA'
        >>> p.parseBusinessContactInformation("Business Name, LLC : Phone- 1234567890", "businessEmail") is None
        True
        >>> p.parseBusinessContactInformation("Business Name, LLC : Phone- 1234567890", "?")
        'dataNotAvailableMessage'
        """
        ...  # placeholder: the original function body goes here


if __name__ == "__main__":
    # Run the doctests only when executed as a script, not on import.
    import doctest
    doctest.testmod()
您应该编写更多的单元测试(使用 https://docs.python.org/3/library/unittest.html 以避免文档字符串泛滥)来确保当前的行为,但这是一个好的开始。
现在,做一个小的转换(transformation):看看那些 (el)if 1 < len(contactData) and ... 行。您可以只测试一次长度:
# Step: test the length once instead of in every branch.
if 1 < len(contactData):
    if re.search('email',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessEmail = contactTemp[1].strip()
        businessDba = contactData[0].strip()
    elif re.search('phone',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessPhone = contactTemp[1].strip()
        businessDba = contactData[0].strip()
    elif re.search('website',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessWebsite = contactTemp[1].strip()
        businessDba = contactData[0].strip()
    elif not re.search(r'(phone|email|website)',contactData[1].lower()):
        businessDba = contactData[1].strip()
    else:
        businessDba = self.dataNotAvailableMessage
else:
    businessDba = self.dataNotAvailableMessage
您注意到倒数第二个 else
无法访问:您有 phone
、email
、website
或没有:
# Step: the unreachable inner else is gone; a plain else covers "no keyword".
if 1 < len(contactData):
    if re.search('email',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessEmail = contactTemp[1].strip()
        businessDba = contactData[0].strip()
    elif re.search('phone',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessPhone = contactTemp[1].strip()
        businessDba = contactData[0].strip()
    elif re.search('website',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessWebsite = contactTemp[1].strip()
        businessDba = contactData[0].strip()
    else:
        businessDba = contactData[1].strip()
else:
    businessDba = self.dataNotAvailableMessage
对 [2] 和 [3] 执行相同的操作:
# Step: same single length test for fields [2] and [3].
if 2 < len(contactData):
    if re.search('email',contactData[2].lower()):
        contactTemp = contactData[2].split('-')
        businessEmail = contactTemp[1].strip()
    elif re.search('phone',contactData[2].lower()):
        contactTemp = contactData[2].split('-')
        businessPhone = contactTemp[1].strip()
    elif re.search('website',contactData[2].lower()):
        contactTemp = contactData[2].split('-')
        businessWebsite = contactTemp[1].strip()

if 3 < len(contactData):
    if re.search('email',contactData[3].lower()):
        contactTemp = contactData[3].split('-')
        businessEmail = contactTemp[1].strip()
    elif re.search('phone',contactData[3].lower()):
        contactTemp = contactData[3].split('-')
        businessPhone = contactTemp[1].strip()
    elif re.search('website',contactData[3].lower()):
        contactTemp = contactData[3].split('-')
        businessWebsite = contactTemp[1].strip()
现在您看到了一个清晰的模式。除了第一部分中对 businessDba 的赋值之外,您显然把相同的过程重复了 3 次。首先,我们把第一部分中 businessDba 的赋值分离出来:
# Step: the businessDba decision, separated from the field parsing.
if 1 < len(contactData):
    if re.search('(email|phone|website)',contactData[1].lower()):
        businessDba = contactData[0].strip()
    else:
        businessDba = contactData[1].strip()
else:
    businessDba = self.dataNotAvailableMessage
然后:
# Step: field [1] is now parsed exactly like fields [2] and [3].
if 1 < len(contactData):
    if re.search('email',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessEmail = contactTemp[1].strip()
    elif re.search('phone',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessPhone = contactTemp[1].strip()
    elif re.search('website',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessWebsite = contactTemp[1].strip()
在我们继续之前,我们可以删除行
businessName = None
businessDba = None
因为 businessName 和 businessDba 总是有一个值。并将新行:
businessDba = contactData[0].strip()
替换为:
businessDba = businessName
这样就明确表达了回退(fallback)逻辑。
现在,我们有三次相同的过程。循环是个好主意:
# Fields [1], [2] and [3] were handled by the unrolled code, so the range
# must end at 4 (exclusive) to keep the behavior unchanged.
for i in range(1, 4):
    if i >= len(contactData):
        break
    if re.search('email',contactData[i].lower()):
        contactTemp = contactData[i].split('-')
        businessEmail = contactTemp[1].strip()
    elif re.search('phone',contactData[i].lower()):
        contactTemp = contactData[i].split('-')
        businessPhone = contactTemp[1].strip()
    elif re.search('website',contactData[i].lower()):
        contactTemp = contactData[i].split('-')
        businessWebsite = contactTemp[1].strip()
我们可以提取contactTemp =
,即使它并不总是有用的:
# Fields [1], [2] and [3] were handled by the unrolled code, so the range
# must end at 4 (exclusive) to keep the behavior unchanged.
for i in range(1, 4):
    if i >= len(contactData):
        break
    # Split once up front, even though a field without a keyword never uses it.
    contactTemp = contactData[i].split('-')
    if re.search('email',contactData[i].lower()):
        businessEmail = contactTemp[1].strip()
    elif re.search('phone',contactData[i].lower()):
        businessPhone = contactTemp[1].strip()
    elif re.search('website',contactData[i].lower()):
        businessWebsite = contactTemp[1].strip()
更好了,但我发现最后一部分 (if element == ...) 真的很繁琐:您要针对所有可能的取值逐一测试 element。这里显然适合使用字典。作为一个小的转换(transformation),我们可以这样写:
d = {
"businessName": businessName,
"businessDba": businessDba,
"businessPhone": businessPhone,
"businessEmail": businessEmail,
"businessWebsite": businessWebsite
}
return d.get(element, self.dataNotAvailableMessage)
现在,我们可以创建它并即时更新它,而不是在最后初始化字典:
d = {
"businessPhone": None,
"businessEmail": None,
"businessWebsite": None
}
# Split rowdata on :
contactData = rowdata.split(':')
## [0] - business name should always be present
d["businessName"] = contactData[0].strip()
if 1 < len(contactData):
if re.search('(email|phone|website)',contactData[1].lower()):
d["businessDba"] = d["businessName"]
else:
d["businessDba"] = contactData[1].strip()
else:
d["businessDba"] = self.dataNotAvailableMessage
for i in range(1, 4):
if i >= len(contactData):
break
contactTemp = contactData[i].split('-')
if re.search('email',contactData[i].lower()):
d["businessEmail"] = contactTemp[1].strip()
elif re.search('phone',contactData[i].lower()):
d["businessPhone" = contactTemp[1].strip()
elif re.search('website',contactData[i].lower()):
d["businessWebsite"] = contactTemp[1].strip()
return d.get(element, self.dataNotAvailableMessage)
我运行测试了每一个修改,它仍然有效,但它不是那么容易阅读。我们可以提取一个创建字典的函数:
def parseBusinessContactInformation(self, rowdata, element):
    """Return one element of the parsed contact data.

    Falls back to self.dataNotAvailableMessage when element is not a key
    of the dict built by _parseBusinessContactInformation.
    """
    d = self._parseBusinessContactInformation(rowdata)
    return d.get(element, self.dataNotAvailableMessage)


def _parseBusinessContactInformation(self, rowdata):
    """Build the dict of contact values from rowdata (body shown earlier)."""
    ...
行为略有改变
这还不错,但我们可以通过 小的行为更改 来改善这一点(我希望你能接受这个新行为!):
for i in range(1, 4):
    if i >= len(contactData):
        break
    contactTemp = contactData[i].split('-')
    # Any "Key- value" field is accepted; the key text becomes part of the dict key.
    if len(contactTemp) > 1:
        d["business" + contactTemp[0].strip()] = contactTemp[1].strip()
行为上有什么改变?简单地说,我们现在可以接受类似下面这样的输入:
>>> p = MyParser()
>>> p.parseBusinessContactInformation("Business Name, LLC : Business DBA : Foo- Bar", "businessFoo")
'Bar'
既然我们接受了更多的element
s,我们应该改变循环range
:
for i in range(1, len(contactData)):
...
是时候关注一个轻微的不一致了:为什么 businessDba
可以具有为不存在的元素创建的值 self.dataNotAvailableMessage
?我们应该使用 None
:
d = {
"businessDba": None,
...
}
并删除这两行:
else:
d["businessDba"] = self.dataNotAvailableMessage
那么可以简化为:
if 1 < len(contactData):
    # A dash means field [1] is a "Key- value" pair, not a DBA name.
    if "-" in contactData[1]:
        d["businessDba"] = d["businessName"]
    else:
        d["businessDba"] = contactData[1].strip()
代码如下:
def parseBusinessContactInformation(self,rowdata,element):
    """Process Business Contact Information

    Examples:

    >>> p = MyParser()
    >>> p.parseBusinessContactInformation("Business Name, LLC : Business DBA : Email- person@email.com : Phone- 1234567890 : Website- www.site.com", "businessPhone")
    '1234567890'
    >>> p.parseBusinessContactInformation("Business Name, LLC : Email- person@email.com : Phone- 1234567890 : Website- www.site.com", "businessName")
    'Business Name, LLC'
    >>> p.parseBusinessContactInformation("Business Name, LLC : Business DBA : Phone- 1234567890 : Website- www.site.com", "businessDba")
    'Business DBA'
    >>> p.parseBusinessContactInformation("Business Name, LLC : Phone- 1234567890", "businessEmail") is None
    True
    >>> p.parseBusinessContactInformation("Business Name, LLC : Phone- 1234567890", "?")
    'dataNotAvailableMessage'
    >>> p.parseBusinessContactInformation("Business Name, LLC : Business DBA : Foo- Bar", "businessFoo")
    'Bar'

    :param rowdata: colon-separated contact string, business name first
    :param element: key of the value to return, e.g. "businessPhone"
    :return: the parsed value, or self.dataNotAvailableMessage when element
        is not a key of the parsed dict
    """
    d = self._parseBusinessContactInformation(rowdata)
    return d.get(element, self.dataNotAvailableMessage)


def _parseBusinessContactInformation(self,rowdata):
    """Parse rowdata into a dict keyed by "business" + field name."""
    d = {
        "businessDba": None,
        "businessPhone": None,
        "businessEmail": None,
        "businessWebsite": None
    }

    # Split rowdata on :
    contactData = rowdata.split(':')

    ## [0] - business name should always be present
    d["businessName"] = contactData[0].strip()

    if 1 < len(contactData):
        # A dash means field [1] is a "Key- value" pair, not a DBA name.
        if "-" in contactData[1]:
            d["businessDba"] = d["businessName"]
        else:
            d["businessDba"] = contactData[1].strip()

    for i in range(1, len(contactData)):
        # Split on the first dash only, so values that contain a dash
        # (e.g. "www.my-site.com") are kept whole.
        contactTemp = contactData[i].split('-', 1)
        if len(contactTemp) > 1:
            d["business" + contactTemp[0].strip()] = contactTemp[1].strip()

    return d
最后一点:切换到 snake case,并创建一个 get 函数和一个 parse 函数:parse 返回一个字典,而 get 返回单个值:
data_not_available_message = "dataNotAvailableMessage"
def get_business_contact_information(self, rowdata, element):
    """Process Business Contact Information

    Examples:

    >>> p = MyParser()
    >>> p.get_business_contact_information("Business Name, LLC : Business DBA : Email- person@email.com : Phone- 1234567890 : Website- www.site.com", "businessPhone")
    '1234567890'
    >>> p.get_business_contact_information("Business Name, LLC : Email- person@email.com : Phone- 1234567890 : Website- www.site.com", "businessName")
    'Business Name, LLC'
    >>> p.get_business_contact_information("Business Name, LLC : Business DBA : Phone- 1234567890 : Website- www.site.com", "businessDba")
    'Business DBA'
    >>> p.get_business_contact_information("Business Name, LLC : Phone- 1234567890", "businessEmail") is None
    True
    >>> p.get_business_contact_information("Business Name, LLC : Phone- 1234567890", "?")
    'dataNotAvailableMessage'
    >>> p.get_business_contact_information("Business Name, LLC : Business DBA : Foo- Bar", "businessFoo")
    'Bar'

    :param rowdata: colon-separated contact string, business name first
    :param element: key of the value to return, e.g. "businessPhone"
    :return: the parsed value, or self.data_not_available_message when
        element is not a key of the parsed dict
    """
    # The parse function is public (parse_business_contact_information),
    # so call it without a leading underscore.
    d = self.parse_business_contact_information(rowdata)
    return d.get(element, self.data_not_available_message)
进行一些外观上的更改以使其更符合 Python 风格:
def parse_business_contact_information(self, rowdata):
    """Process Business Contact Information

    The first colon-separated field is the business name. The second field
    is the DBA name unless it contains a dash, in which case the DBA falls
    back to the business name. Every remaining "Key- value" field is stored
    under the key "business" + Key.

    Examples:

    >>> p = MyParser()
    >>> p.parse_business_contact_information("Business Name, LLC : Business DBA : Email- person@email.com : Phone- 1234567890 : Website- www.site.com") == {
    ...     'businessDba': 'Business DBA', 'businessPhone': '1234567890', 'businessEmail': 'person@email.com',
    ...     'businessWebsite': 'www.site.com', 'businessName': 'Business Name, LLC'}
    True
    >>> p.parse_business_contact_information("Business Name, LLC : Phone- 1234567890") == {
    ...     'businessDba': 'Business Name, LLC', 'businessPhone': '1234567890', 'businessEmail': None,
    ...     'businessWebsite': None, 'businessName': 'Business Name, LLC'}
    True

    :param rowdata: colon-separated contact string, business name first
    :return: dict of parsed values; businessDba, businessPhone,
        businessEmail and businessWebsite are None when absent
    """
    d = dict.fromkeys(("businessDba", "businessPhone",
                       "businessEmail", "businessWebsite"))

    name, *others = rowdata.split(':')  # destructuring assignment
    d["businessName"] = name.strip()
    if not others:
        return d

    if "-" in others[0]:
        # others[0] is a "Key- value" field: leave it for the loop below.
        d["businessDba"] = d["businessName"]
    else:
        d["businessDba"] = others[0].strip()
        others.pop(0)  # consume others[0]

    for data in others:
        try:
            key, value = data.split('-', 1)  # a- b-c => a, b-c
        except ValueError:  # no dash: not enough values to unpack
            print("Element {} should have a dash".format(data))
        else:
            d["business" + key.strip()] = value.strip()

    return d
代码并不完美,但比以前更清晰了,至少在我看来是这样。
方法总结:
- 编写单元测试以保护行为;
- 进行保持行为不变的小转换(transformation)以提高可读性。尽可能提取公共部分(factorize),此时不必关注性能;
- 继续,直到你有清楚的东西/当你绕圈子做不必要的修改时停下来;
- 如有必要,提高性能。
我有一个杂乱的函数,我正在努力重构以使其更高效和可读。我的 python 技能充其量只是初级到中级水平,我想有一种更简洁的方法可以完成这项任务。
下面的函数接受一个字符串,其中包含各种与业务联系相关的信息。信息以冒号分隔。公司名称始终是第一个字段,因此可以轻松提取,但其余“列”(冒号之间的数据)可能存在也可能不存在,并且顺序并不总是相同。
该函数有两个参数,1) 行数据(包含以下示例的字符串)和 2) 我希望返回的数据元素。
# Business Contact Information
def parseBusinessContactInformation(self,rowdata,element):
    ## Process Business Contact Information
    ## rowdata : colon-separated contact string, business name first
    ## element : which value to return - businessName, businessDba,
    ##           businessPhone, businessEmail or businessWebsite
    ## returns : the requested value; None for a phone/email/website that was
    ##           not found; self.dataNotAvailableMessage for an unknown element
    ## example rowdata = "Business Name, LLC : Business DBA : Email- person@email.com : Phone- 1234567890 : Website- www.site.com"
    ## example rowdata = "Business Name, LLC : Email- person@email.com : Phone- 1234567890 : Website- www.site.com"
    ## example rowdata = "Business Name, LLC : Business DBA : Phone- 1234567890 : Website- www.site.com"
    ## example rowdata = "Business Name, LLC : Phone- 1234567890"

    businessName = None
    businessDba = None
    businessPhone = None
    businessEmail = None
    businessWebsite = None

    # Split rowdata on :
    contactData = rowdata.split(':')

    ## [0] - business name should always be present
    businessName = contactData[0].strip()

    ## [1] - doing_business_as or another field if not present
    ## (when [1] is a phone/email/website field, the DBA falls back to the name)
    if 1 < len(contactData) and re.search('email',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessEmail = contactTemp[1].strip()
        businessDba = contactData[0].strip()
    elif 1 < len(contactData) and re.search('phone',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessPhone = contactTemp[1].strip()
        businessDba = contactData[0].strip()
    elif 1 < len(contactData) and re.search('website',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessWebsite = contactTemp[1].strip()
        businessDba = contactData[0].strip()
    elif 1 < len(contactData) and not re.search(r'(phone|email|website)',contactData[1].lower()):
        businessDba = contactData[1].strip()
    else:
        # reached when rowdata has no second field at all
        businessDba = self.dataNotAvailableMessage

    ## [2] - phone or email or website
    if 2 < len(contactData) and re.search('email',contactData[2].lower()):
        contactTemp = contactData[2].split('-')
        businessEmail = contactTemp[1].strip()
    elif 2 < len(contactData) and re.search('phone',contactData[2].lower()):
        contactTemp = contactData[2].split('-')
        businessPhone = contactTemp[1].strip()
    elif 2 < len(contactData) and re.search('website',contactData[2].lower()):
        contactTemp = contactData[2].split('-')
        businessWebsite = contactTemp[1].strip()

    ## [3] - phone or email or website
    ## NOTE: only fields [1]-[3] are inspected, so a fifth field (such as
    ## Website in the first example above) is ignored.
    if 3 < len(contactData) and re.search('email',contactData[3].lower()):
        contactTemp = contactData[3].split('-')
        businessEmail = contactTemp[1].strip()
    elif 3 < len(contactData) and re.search('phone',contactData[3].lower()):
        contactTemp = contactData[3].split('-')
        businessPhone = contactTemp[1].strip()
    elif 3 < len(contactData) and re.search('website',contactData[3].lower()):
        contactTemp = contactData[3].split('-')
        businessWebsite = contactTemp[1].strip()

    if element == "businessName":
        return businessName
    elif element == "businessDba":
        return businessDba
    elif element == "businessPhone":
        return businessPhone
    elif element == "businessEmail":
        return businessEmail
    elif element == "businessWebsite":
        return businessWebsite
    else:
        return self.dataNotAvailableMessage
我正在尝试了解更好的方法。
重构是一个累积的过程。 Martin Fowler 和 Kent Beck 在 Refactoring 中对该方法进行了全面的描述。
Its heart is a series of small behavior preserving transformations. (Martin Fowler, https://refactoring.com/)
最重要的部分是:“小”和“行为保持”。 “小”这个词是不言自明的,但是“行为保持”应该通过单元测试来保证。
初步评论:我建议您坚持使用 PEP 8 Style Guide。
行为保留
用文档字符串(docstring)替换您的注释 (https://www.python.org/dev/peps/pep-0008/#id33)。这非常有用,因为您可以在文档字符串中编写一些单元测试(即 doctest)。
class MyParser:
    # Value returned when the requested element name is not recognised.
    dataNotAvailableMessage = "dataNotAvailableMessage"

    # Business Contact Information
    def parseBusinessContactInformation(self,rowdata,element):
        """Process Business Contact Information

        Examples:

        >>> p = MyParser()
        >>> p.parseBusinessContactInformation("Business Name, LLC : Business DBA : Email- person@email.com : Phone- 1234567890 : Website- www.site.com", "businessPhone")
        '1234567890'
        >>> p.parseBusinessContactInformation("Business Name, LLC : Email- person@email.com : Phone- 1234567890 : Website- www.site.com", "businessName")
        'Business Name, LLC'
        >>> p.parseBusinessContactInformation("Business Name, LLC : Business DBA : Phone- 1234567890 : Website- www.site.com", "businessDba")
        'Business DBA'
        >>> p.parseBusinessContactInformation("Business Name, LLC : Phone- 1234567890", "businessEmail") is None
        True
        >>> p.parseBusinessContactInformation("Business Name, LLC : Phone- 1234567890", "?")
        'dataNotAvailableMessage'
        """
        ...  # placeholder: the original function body goes here


if __name__ == "__main__":
    # Run the doctests only when executed as a script, not on import.
    import doctest
    doctest.testmod()
您应该编写更多的单元测试(使用 https://docs.python.org/3/library/unittest.html 以避免文档字符串泛滥)来确保当前的行为,但这是一个好的开始。
现在,做一个小的转换(transformation):看看那些 (el)if 1 < len(contactData) and ... 行。您可以只测试一次长度:
# Step: test the length once instead of in every branch.
if 1 < len(contactData):
    if re.search('email',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessEmail = contactTemp[1].strip()
        businessDba = contactData[0].strip()
    elif re.search('phone',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessPhone = contactTemp[1].strip()
        businessDba = contactData[0].strip()
    elif re.search('website',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessWebsite = contactTemp[1].strip()
        businessDba = contactData[0].strip()
    elif not re.search(r'(phone|email|website)',contactData[1].lower()):
        businessDba = contactData[1].strip()
    else:
        businessDba = self.dataNotAvailableMessage
else:
    businessDba = self.dataNotAvailableMessage
您注意到倒数第二个 else
无法访问:您有 phone
、email
、website
或没有:
# Step: the unreachable inner else is gone; a plain else covers "no keyword".
if 1 < len(contactData):
    if re.search('email',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessEmail = contactTemp[1].strip()
        businessDba = contactData[0].strip()
    elif re.search('phone',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessPhone = contactTemp[1].strip()
        businessDba = contactData[0].strip()
    elif re.search('website',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessWebsite = contactTemp[1].strip()
        businessDba = contactData[0].strip()
    else:
        businessDba = contactData[1].strip()
else:
    businessDba = self.dataNotAvailableMessage
对 [2] 和 [3] 执行相同的操作:
# Step: same single length test for fields [2] and [3].
if 2 < len(contactData):
    if re.search('email',contactData[2].lower()):
        contactTemp = contactData[2].split('-')
        businessEmail = contactTemp[1].strip()
    elif re.search('phone',contactData[2].lower()):
        contactTemp = contactData[2].split('-')
        businessPhone = contactTemp[1].strip()
    elif re.search('website',contactData[2].lower()):
        contactTemp = contactData[2].split('-')
        businessWebsite = contactTemp[1].strip()

if 3 < len(contactData):
    if re.search('email',contactData[3].lower()):
        contactTemp = contactData[3].split('-')
        businessEmail = contactTemp[1].strip()
    elif re.search('phone',contactData[3].lower()):
        contactTemp = contactData[3].split('-')
        businessPhone = contactTemp[1].strip()
    elif re.search('website',contactData[3].lower()):
        contactTemp = contactData[3].split('-')
        businessWebsite = contactTemp[1].strip()
现在您看到了一个清晰的模式。除了第一部分中对 businessDba 的赋值之外,您显然把相同的过程重复了 3 次。首先,我们把第一部分中 businessDba 的赋值分离出来:
# Step: the businessDba decision, separated from the field parsing.
if 1 < len(contactData):
    if re.search('(email|phone|website)',contactData[1].lower()):
        businessDba = contactData[0].strip()
    else:
        businessDba = contactData[1].strip()
else:
    businessDba = self.dataNotAvailableMessage
然后:
# Step: field [1] is now parsed exactly like fields [2] and [3].
if 1 < len(contactData):
    if re.search('email',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessEmail = contactTemp[1].strip()
    elif re.search('phone',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessPhone = contactTemp[1].strip()
    elif re.search('website',contactData[1].lower()):
        contactTemp = contactData[1].split('-')
        businessWebsite = contactTemp[1].strip()
在我们继续之前,我们可以删除行
businessName = None
businessDba = None
因为 businessName 和 businessDba 总是有一个值。并将新行:
businessDba = contactData[0].strip()
替换为:
businessDba = businessName
这样就明确表达了回退(fallback)逻辑。
现在,我们有三次相同的过程。循环是个好主意:
# Fields [1], [2] and [3] were handled by the unrolled code, so the range
# must end at 4 (exclusive) to keep the behavior unchanged.
for i in range(1, 4):
    if i >= len(contactData):
        break
    if re.search('email',contactData[i].lower()):
        contactTemp = contactData[i].split('-')
        businessEmail = contactTemp[1].strip()
    elif re.search('phone',contactData[i].lower()):
        contactTemp = contactData[i].split('-')
        businessPhone = contactTemp[1].strip()
    elif re.search('website',contactData[i].lower()):
        contactTemp = contactData[i].split('-')
        businessWebsite = contactTemp[1].strip()
我们可以提取contactTemp =
,即使它并不总是有用的:
# Fields [1], [2] and [3] were handled by the unrolled code, so the range
# must end at 4 (exclusive) to keep the behavior unchanged.
for i in range(1, 4):
    if i >= len(contactData):
        break
    # Split once up front, even though a field without a keyword never uses it.
    contactTemp = contactData[i].split('-')
    if re.search('email',contactData[i].lower()):
        businessEmail = contactTemp[1].strip()
    elif re.search('phone',contactData[i].lower()):
        businessPhone = contactTemp[1].strip()
    elif re.search('website',contactData[i].lower()):
        businessWebsite = contactTemp[1].strip()
更好了,但我发现最后一部分 (if element == ...) 真的很繁琐:您要针对所有可能的取值逐一测试 element。这里显然适合使用字典。作为一个小的转换(transformation),我们可以这样写:
d = {
"businessName": businessName,
"businessDba": businessDba,
"businessPhone": businessPhone,
"businessEmail": businessEmail,
"businessWebsite": businessWebsite
}
return d.get(element, self.dataNotAvailableMessage)
现在,我们可以创建它并即时更新它,而不是在最后初始化字典:
d = {
"businessPhone": None,
"businessEmail": None,
"businessWebsite": None
}
# Split rowdata on :
contactData = rowdata.split(':')
## [0] - business name should always be present
d["businessName"] = contactData[0].strip()
if 1 < len(contactData):
if re.search('(email|phone|website)',contactData[1].lower()):
d["businessDba"] = d["businessName"]
else:
d["businessDba"] = contactData[1].strip()
else:
d["businessDba"] = self.dataNotAvailableMessage
for i in range(1, 4):
if i >= len(contactData):
break
contactTemp = contactData[i].split('-')
if re.search('email',contactData[i].lower()):
d["businessEmail"] = contactTemp[1].strip()
elif re.search('phone',contactData[i].lower()):
d["businessPhone" = contactTemp[1].strip()
elif re.search('website',contactData[i].lower()):
d["businessWebsite"] = contactTemp[1].strip()
return d.get(element, self.dataNotAvailableMessage)
我运行测试了每一个修改,它仍然有效,但它不是那么容易阅读。我们可以提取一个创建字典的函数:
def parseBusinessContactInformation(self, rowdata, element):
    """Return one element of the parsed contact data.

    Falls back to self.dataNotAvailableMessage when element is not a key
    of the dict built by _parseBusinessContactInformation.
    """
    d = self._parseBusinessContactInformation(rowdata)
    return d.get(element, self.dataNotAvailableMessage)


def _parseBusinessContactInformation(self, rowdata):
    """Build the dict of contact values from rowdata (body shown earlier)."""
    ...
行为略有改变
这还不错,但我们可以通过 小的行为更改 来改善这一点(我希望你能接受这个新行为!):
for i in range(1, 4):
    if i >= len(contactData):
        break
    contactTemp = contactData[i].split('-')
    # Any "Key- value" field is accepted; the key text becomes part of the dict key.
    if len(contactTemp) > 1:
        d["business" + contactTemp[0].strip()] = contactTemp[1].strip()
行为上有什么改变?简单地说,我们现在可以接受类似下面这样的输入:
>>> p = MyParser()
>>> p.parseBusinessContactInformation("Business Name, LLC : Business DBA : Foo- Bar", "businessFoo")
'Bar'
既然我们接受了更多的element
s,我们应该改变循环range
:
for i in range(1, len(contactData)):
...
是时候关注一个轻微的不一致了:为什么 businessDba
可以具有为不存在的元素创建的值 self.dataNotAvailableMessage
?我们应该使用 None
:
d = {
"businessDba": None,
...
}
并删除这两行:
else:
d["businessDba"] = self.dataNotAvailableMessage
那么可以简化为:
if 1 < len(contactData):
    # A dash means field [1] is a "Key- value" pair, not a DBA name.
    if "-" in contactData[1]:
        d["businessDba"] = d["businessName"]
    else:
        d["businessDba"] = contactData[1].strip()
代码如下:
def parseBusinessContactInformation(self,rowdata,element):
    """Process Business Contact Information

    Examples:

    >>> p = MyParser()
    >>> p.parseBusinessContactInformation("Business Name, LLC : Business DBA : Email- person@email.com : Phone- 1234567890 : Website- www.site.com", "businessPhone")
    '1234567890'
    >>> p.parseBusinessContactInformation("Business Name, LLC : Email- person@email.com : Phone- 1234567890 : Website- www.site.com", "businessName")
    'Business Name, LLC'
    >>> p.parseBusinessContactInformation("Business Name, LLC : Business DBA : Phone- 1234567890 : Website- www.site.com", "businessDba")
    'Business DBA'
    >>> p.parseBusinessContactInformation("Business Name, LLC : Phone- 1234567890", "businessEmail") is None
    True
    >>> p.parseBusinessContactInformation("Business Name, LLC : Phone- 1234567890", "?")
    'dataNotAvailableMessage'
    >>> p.parseBusinessContactInformation("Business Name, LLC : Business DBA : Foo- Bar", "businessFoo")
    'Bar'

    :param rowdata: colon-separated contact string, business name first
    :param element: key of the value to return, e.g. "businessPhone"
    :return: the parsed value, or self.dataNotAvailableMessage when element
        is not a key of the parsed dict
    """
    d = self._parseBusinessContactInformation(rowdata)
    return d.get(element, self.dataNotAvailableMessage)


def _parseBusinessContactInformation(self,rowdata):
    """Parse rowdata into a dict keyed by "business" + field name."""
    d = {
        "businessDba": None,
        "businessPhone": None,
        "businessEmail": None,
        "businessWebsite": None
    }

    # Split rowdata on :
    contactData = rowdata.split(':')

    ## [0] - business name should always be present
    d["businessName"] = contactData[0].strip()

    if 1 < len(contactData):
        # A dash means field [1] is a "Key- value" pair, not a DBA name.
        if "-" in contactData[1]:
            d["businessDba"] = d["businessName"]
        else:
            d["businessDba"] = contactData[1].strip()

    for i in range(1, len(contactData)):
        # Split on the first dash only, so values that contain a dash
        # (e.g. "www.my-site.com") are kept whole.
        contactTemp = contactData[i].split('-', 1)
        if len(contactTemp) > 1:
            d["business" + contactTemp[0].strip()] = contactTemp[1].strip()

    return d
最后一点:切换到 snake case,并创建一个 get 函数和一个 parse 函数:parse 返回一个字典,而 get 返回单个值:
data_not_available_message = "dataNotAvailableMessage"
def get_business_contact_information(self, rowdata, element):
    """Process Business Contact Information

    Examples:

    >>> p = MyParser()
    >>> p.get_business_contact_information("Business Name, LLC : Business DBA : Email- person@email.com : Phone- 1234567890 : Website- www.site.com", "businessPhone")
    '1234567890'
    >>> p.get_business_contact_information("Business Name, LLC : Email- person@email.com : Phone- 1234567890 : Website- www.site.com", "businessName")
    'Business Name, LLC'
    >>> p.get_business_contact_information("Business Name, LLC : Business DBA : Phone- 1234567890 : Website- www.site.com", "businessDba")
    'Business DBA'
    >>> p.get_business_contact_information("Business Name, LLC : Phone- 1234567890", "businessEmail") is None
    True
    >>> p.get_business_contact_information("Business Name, LLC : Phone- 1234567890", "?")
    'dataNotAvailableMessage'
    >>> p.get_business_contact_information("Business Name, LLC : Business DBA : Foo- Bar", "businessFoo")
    'Bar'

    :param rowdata: colon-separated contact string, business name first
    :param element: key of the value to return, e.g. "businessPhone"
    :return: the parsed value, or self.data_not_available_message when
        element is not a key of the parsed dict
    """
    # The parse function is public (parse_business_contact_information),
    # so call it without a leading underscore.
    d = self.parse_business_contact_information(rowdata)
    return d.get(element, self.data_not_available_message)
进行一些外观上的更改以使其更符合 Python 风格:
def parse_business_contact_information(self, rowdata):
    """Process Business Contact Information

    The first colon-separated field is the business name. The second field
    is the DBA name unless it contains a dash, in which case the DBA falls
    back to the business name. Every remaining "Key- value" field is stored
    under the key "business" + Key.

    Examples:

    >>> p = MyParser()
    >>> p.parse_business_contact_information("Business Name, LLC : Business DBA : Email- person@email.com : Phone- 1234567890 : Website- www.site.com") == {
    ...     'businessDba': 'Business DBA', 'businessPhone': '1234567890', 'businessEmail': 'person@email.com',
    ...     'businessWebsite': 'www.site.com', 'businessName': 'Business Name, LLC'}
    True
    >>> p.parse_business_contact_information("Business Name, LLC : Phone- 1234567890") == {
    ...     'businessDba': 'Business Name, LLC', 'businessPhone': '1234567890', 'businessEmail': None,
    ...     'businessWebsite': None, 'businessName': 'Business Name, LLC'}
    True

    :param rowdata: colon-separated contact string, business name first
    :return: dict of parsed values; businessDba, businessPhone,
        businessEmail and businessWebsite are None when absent
    """
    d = dict.fromkeys(("businessDba", "businessPhone",
                       "businessEmail", "businessWebsite"))

    name, *others = rowdata.split(':')  # destructuring assignment
    d["businessName"] = name.strip()
    if not others:
        return d

    if "-" in others[0]:
        # others[0] is a "Key- value" field: leave it for the loop below.
        d["businessDba"] = d["businessName"]
    else:
        d["businessDba"] = others[0].strip()
        others.pop(0)  # consume others[0]

    for data in others:
        try:
            key, value = data.split('-', 1)  # a- b-c => a, b-c
        except ValueError:  # no dash: not enough values to unpack
            print("Element {} should have a dash".format(data))
        else:
            d["business" + key.strip()] = value.strip()

    return d
代码并不完美,但比以前更清晰了,至少在我看来是这样。
方法总结:
- 编写单元测试以保护行为;
- 进行保持行为不变的小转换(transformation)以提高可读性。尽可能提取公共部分(factorize),此时不必关注性能;
- 继续,直到你有清楚的东西/当你绕圈子做不必要的修改时停下来;
- 如有必要,提高性能。