从文件中提取 IP 地址
Extracting IP addresses from a file
我正在尝试用 Python 从一个 asp 文件中提取 IP 地址,该文件如下所示:
onInternalNet = (
isInNet(hostDNS, "147.163.1.0", "255.255.0.0") ||
isInNet(hostDNS, "123.264.0.0", "255.255.0.0") ||
isInNet(hostDNS, "137.5.0.0", "255.0.0.0") ||
isInNet(hostDNS, "100.01.02.0", "255.0.0.0") ||
isInNet(hostDNS, "172.146.30.0", "255.240.0.0") ||
isInNet(hostDNS, "112.268.0.0", "255.255.0.0") ||
我是如何尝试使用正则表达式提取它们的:
if re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", line):
但是我收到一个错误:
Traceback (most recent call last):
File "pull_proxy.py", line 27, in <module>
write_to_file(extract_proxies(in_file), out_file)
File "pull_proxy.py", line 8, in extract_proxies
if re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", line):
File "C:\Python27\lib\re.py", line 194, in compile
return _compile(pattern, flags)
File "C:\Python27\lib\re.py", line 233, in _compile
bypass_cache = flags & DEBUG
TypeError: unsupported operand type(s) for &: 'str' and 'int'
我不明白为什么会出现该错误,我该如何处理这段代码才能使其按照我的意愿提取信息?
import re
# Collect the lines of in_file that pass the IP-address check and return them as a list.
# (Indentation of this snippet was lost when the page was extracted.)
def extract_proxies(in_file):
buffer = []
for line in in_file:
# NOTE(review): the second positional argument of re.compile is `flags`, not the
# text to search. Passing `line` (a str) here is what produces the
# "unsupported operand type(s) for &: 'str' and 'int'" TypeError in the traceback.
if re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", line):
print "{} appened to buffer.".format(line)
buffer.append(line)
else:
pass
return buffer
# Append every entry of buffer to the file named by out_file.
def write_to_file(buffer, out_file):
for proxy in buffer:
# As posted, the file is opened in "a+" (append) mode once per entry.
with open(out_file, "a+") as res:
# Entries are written verbatim; no separator or newline is added here.
res.write(proxy)
# Script entry point: run the extraction on in_file and append the result to out_file.
if __name__ == '__main__':
print "Running...."
in_file = "C:/Users/thomas_j_perkins/Downloads/test.asp"
out_file = "c:/users/thomas_j_perkins/Downloads/results.txt"
# NOTE(review): in_file is still a path string at this point, not an open file,
# so extract_proxies would iterate over its characters rather than over file lines.
write_to_file(extract_proxies(in_file), out_file)
编辑
意识到我没有打开文件:
import re
# Collect the lines of the open file in_file that pass the IP-address check,
# close the file, and return the collected lines as a list.
def extract_proxies(in_file):
buffer = []
for line in in_file:
# NOTE(review): `line` is still being passed as the `flags` argument of
# re.compile, so the same TypeError is raised as before the edit.
if re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", line):
print "{} appened to buffer.".format(line)
buffer.append(line)
else:
pass
# Closes the handle that the caller passed in.
in_file.close()
return buffer
# Append every entry of buffer to the file named by out_file.
def write_to_file(buffer, out_file):
for proxy in buffer:
# As posted, the file is opened in "a+" (append) mode once per entry.
with open(out_file, "a+") as res:
# Entries are written verbatim; no separator or newline is added here.
res.write(proxy)
# Script entry point: open in_file, run the extraction and append the result to out_file.
if __name__ == '__main__':
print "Running...."
in_file = "C:/Users/thomas_j_perkins/Downloads/PAC-Global-Vista.asp"
out_file = "c:/users/thomas_j_perkins/Downloads/results.txt"
# The input is now opened ("r+") before being handed to extract_proxies,
# which closes it itself once it has finished reading.
write_to_file(extract_proxies(open(in_file, "r+")), out_file)
仍然出现同样的错误:
Running....
Traceback (most recent call last):
File "pull_proxy.py", line 28, in <module>
write_to_file(extract_proxies(open(in_file)), out_file)
File "pull_proxy.py", line 8, in extract_proxies
if re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", line):
File "C:\Python27\lib\re.py", line 194, in compile
return _compile(pattern, flags)
File "C:\Python27\lib\re.py", line 233, in _compile
bypass_cache = flags & DEBUG
TypeError: unsupported operand type(s) for &: 'str' and 'int'
re.compile 的第二个参数是 flags,必须是整数标志;而你传入的 line 是字符串,因此出错。
你应该使用 re.match 而不是 re.compile。文档对 re.compile 的说明是:
Compile a regular expression pattern into a regular expression object,
which can be used for matching using its match()
and search()
methods...
你的初始错误
TypeError: unsupported operand type(s) for &: 'str' and 'int'
正是@Moses 在他的回答中所说的造成的。标志应该是 int 值,而不是字符串。
你应该编译你的正则表达式一次。此外,在遍历行时需要使用打开的文件句柄。
import re
# Compiled once at import time so the loop below does not rebuild the pattern per line.
# It matches any four dot-separated groups of 1-3 digits; it does NOT check that each
# group is in the 0-255 range (e.g. "123.264.0.0" is accepted).
IP_MATCHER = re.compile(r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})")


def extract_proxies(fh):
    """Print and collect every dotted-quad string found in the lines of ``fh``.

    Args:
        fh: an iterable of text lines, typically an open file handle. It is
            only iterated; this function does not close it.

    Returns:
        A list of all matched strings, in the order they appear. Lines that
        contain no match contribute nothing. (Earlier revisions only printed
        the matches; callers that ignore the return value are unaffected.)
    """
    found = []
    for line in fh:
        line = line.strip()
        # findall returns every non-overlapping match on the line, so a line
        # holding both an address and a netmask yields two entries.
        match = IP_MATCHER.findall(line)
        if match:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("{} appened to buffer.".format(line))
            print(match)
            found.extend(match)
    return found
def write_to_file(buffer, out_file):
    """Append every entry of ``buffer`` to the file at ``out_file``.

    Args:
        buffer: sequence of strings to write. Entries are written verbatim;
            no newline or other separator is added.
        out_file: path of the file to append to; created if it does not exist.

    The file is opened once for the whole batch instead of once per entry.
    """
    if not buffer:
        # Nothing to write: do not touch (or create) the output file.
        return
    with open(out_file, "a") as res:
        for proxy in buffer:
            res.write(proxy)
if __name__ == '__main__':
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Running....")
    in_file = "in.txt"
    # Iterate over an open handle (not the path string); the with-block
    # closes the file once extraction is done.
    with open(in_file) as fh:
        extract_proxies(fh)
这将找到所有匹配项,如果您只想要第一个,则使用 IP_MATCHER.search
和 match.groups()
。这当然是假设您确实想要提取 IP 地址。
例如:
def extract_proxies(fh):
    """Print the address and mask of each line of ``fh`` holding exactly two matches.

    Args:
        fh: an iterable of text lines, typically an open file handle. It is
            only iterated; this function does not close it.

    Relies on the module-level ``IP_MATCHER`` compiled pattern. Lines with
    any other number of matches are skipped. Nothing is returned.
    """
    for line in fh:
        line = line.strip()
        match = IP_MATCHER.findall(line)
        # Exactly two matches are treated as an (address, mask) pair.
        if len(match) == 2:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("{} appened to buffer.".format(line))
            ip, mask = match
            print("IP: %s => Mask: %s" % (ip, mask))
请检查以下代码:
做了一些改变
- re.compile:正则表达式应先用 re.compile 编译,然后再通过编译得到的对象调用 match/search/findall 来使用。
- 原来的正则表达式不正确。match 从行首开始匹配,因此编写正则表达式时必须从行首写起;它不会直接匹配位于行中间的内容。
import re
def extract_proxies(in_file):
    """Return the first quoted dotted-quad argument of each matching line.

    Args:
        in_file: an open text file (or any iterable of lines with a
            ``close()`` method). It is always closed before returning,
            even if reading fails part-way.

    Returns:
        A list with one captured address string per matching line, in file
        order. The pattern checks digit counts only, not the 0-255 range.
    """
    buffer1 = []
    # Compiled once, outside the loop. Shape matched: optional leading
    # whitespace, name(name, "d.d.d.d" ; group 1 is the quoted address.
    m = re.compile(r'\s*\w+\(\w+,\s+\"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\"')
    try:
        for line in in_file:
            # match() anchors at the start of the line, which is why the
            # pattern spells out everything that precedes the address.
            r = m.match(line)
            if r is not None:
                # Single-argument print(...) behaves the same on Python 2 and 3.
                print("{} appened to buffer.".format(line))
                buffer1.append(r.group(1))
    finally:
        # Callers pass a freshly opened handle and never close it themselves.
        in_file.close()
    return buffer1
def write_to_file(buffer1, out_file):
    """Append every entry of ``buffer1`` to ``out_file``, one entry per line.

    Args:
        buffer1: sequence of strings to write; a newline is added after each.
        out_file: path of the file to append to; created if it does not exist.

    The file is opened once for the whole batch instead of once per entry.
    """
    if not buffer1:
        # Nothing to write: do not touch (or create) the output file.
        return
    with open(out_file, "a") as res:
        for proxy in buffer1:
            res.write(proxy + '\n')
if __name__ == '__main__':
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Running....")
    in_file = "sample.txt"
    out_file = "results.txt"
    # extract_proxies closes the handle it is given, so it is not closed here.
    write_to_file(extract_proxies(open(in_file)), out_file)
输出:
C:\Users\dinesh_pundkar\Desktop>python c.py
Running....
isInNet(hostDNS, "147.163.1.0", "255.255.0.0") ||
appened to buffer.
isInNet(hostDNS, "123.264.0.0", "255.255.0.0") ||
appened to buffer.
isInNet(hostDNS, "137.5.0.0", "255.0.0.0") ||
appened to buffer.
isInNet(hostDNS, "100.01.02.0", "255.0.0.0") ||
appened to buffer.
isInNet(hostDNS, "172.146.30.0", "255.240.0.0") ||
appened to buffer.
isInNet(hostDNS, "112.268.0.0", "255.255.0.0") || appened to buffer.
C:\Users\dinesh_pundkar\Desktop>python c.py
我正在尝试用 Python 从一个 asp 文件中提取 IP 地址,该文件如下所示:
onInternalNet = (
isInNet(hostDNS, "147.163.1.0", "255.255.0.0") ||
isInNet(hostDNS, "123.264.0.0", "255.255.0.0") ||
isInNet(hostDNS, "137.5.0.0", "255.0.0.0") ||
isInNet(hostDNS, "100.01.02.0", "255.0.0.0") ||
isInNet(hostDNS, "172.146.30.0", "255.240.0.0") ||
isInNet(hostDNS, "112.268.0.0", "255.255.0.0") ||
我是如何尝试使用正则表达式提取它们的:
if re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", line):
但是我收到一个错误:
Traceback (most recent call last):
File "pull_proxy.py", line 27, in <module>
write_to_file(extract_proxies(in_file), out_file)
File "pull_proxy.py", line 8, in extract_proxies
if re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", line):
File "C:\Python27\lib\re.py", line 194, in compile
return _compile(pattern, flags)
File "C:\Python27\lib\re.py", line 233, in _compile
bypass_cache = flags & DEBUG
TypeError: unsupported operand type(s) for &: 'str' and 'int'
我不明白为什么会出现该错误,我该如何处理这段代码才能使其按照我的意愿提取信息?
import re
# Collect the lines of in_file that pass the IP-address check and return them as a list.
# (Indentation of this snippet was lost when the page was extracted.)
def extract_proxies(in_file):
buffer = []
for line in in_file:
# NOTE(review): the second positional argument of re.compile is `flags`, not the
# text to search. Passing `line` (a str) here is what produces the
# "unsupported operand type(s) for &: 'str' and 'int'" TypeError in the traceback.
if re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", line):
print "{} appened to buffer.".format(line)
buffer.append(line)
else:
pass
return buffer
# Append every entry of buffer to the file named by out_file.
def write_to_file(buffer, out_file):
for proxy in buffer:
# As posted, the file is opened in "a+" (append) mode once per entry.
with open(out_file, "a+") as res:
# Entries are written verbatim; no separator or newline is added here.
res.write(proxy)
# Script entry point: run the extraction on in_file and append the result to out_file.
if __name__ == '__main__':
print "Running...."
in_file = "C:/Users/thomas_j_perkins/Downloads/test.asp"
out_file = "c:/users/thomas_j_perkins/Downloads/results.txt"
# NOTE(review): in_file is still a path string at this point, not an open file,
# so extract_proxies would iterate over its characters rather than over file lines.
write_to_file(extract_proxies(in_file), out_file)
编辑
意识到我没有打开文件:
import re
# Collect the lines of the open file in_file that pass the IP-address check,
# close the file, and return the collected lines as a list.
def extract_proxies(in_file):
buffer = []
for line in in_file:
# NOTE(review): `line` is still being passed as the `flags` argument of
# re.compile, so the same TypeError is raised as before the edit.
if re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", line):
print "{} appened to buffer.".format(line)
buffer.append(line)
else:
pass
# Closes the handle that the caller passed in.
in_file.close()
return buffer
# Append every entry of buffer to the file named by out_file.
def write_to_file(buffer, out_file):
for proxy in buffer:
# As posted, the file is opened in "a+" (append) mode once per entry.
with open(out_file, "a+") as res:
# Entries are written verbatim; no separator or newline is added here.
res.write(proxy)
# Script entry point: open in_file, run the extraction and append the result to out_file.
if __name__ == '__main__':
print "Running...."
in_file = "C:/Users/thomas_j_perkins/Downloads/PAC-Global-Vista.asp"
out_file = "c:/users/thomas_j_perkins/Downloads/results.txt"
# The input is now opened ("r+") before being handed to extract_proxies,
# which closes it itself once it has finished reading.
write_to_file(extract_proxies(open(in_file, "r+")), out_file)
仍然出现同样的错误:
Running....
Traceback (most recent call last):
File "pull_proxy.py", line 28, in <module>
write_to_file(extract_proxies(open(in_file)), out_file)
File "pull_proxy.py", line 8, in extract_proxies
if re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", line):
File "C:\Python27\lib\re.py", line 194, in compile
return _compile(pattern, flags)
File "C:\Python27\lib\re.py", line 233, in _compile
bypass_cache = flags & DEBUG
TypeError: unsupported operand type(s) for &: 'str' and 'int'
re.compile 的第二个参数是 flags,必须是整数标志;而你传入的 line 是字符串,因此出错。
你应该使用 re.match 而不是 re.compile。文档对 re.compile 的说明是:
Compile a regular expression pattern into a regular expression object, which can be used for matching using its
match()
and search()
methods...
你的初始错误
TypeError: unsupported operand type(s) for &: 'str' and 'int'
正是@Moses 在他的回答中所说的造成的。标志应该是 int 值,而不是字符串。
你应该编译你的正则表达式一次。此外,在遍历行时需要使用打开的文件句柄。
import re
# Compiled once at import time so the loop below does not rebuild the pattern per line.
# It matches any four dot-separated groups of 1-3 digits; it does NOT check that each
# group is in the 0-255 range (e.g. "123.264.0.0" is accepted).
IP_MATCHER = re.compile(r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})")


def extract_proxies(fh):
    """Print and collect every dotted-quad string found in the lines of ``fh``.

    Args:
        fh: an iterable of text lines, typically an open file handle. It is
            only iterated; this function does not close it.

    Returns:
        A list of all matched strings, in the order they appear. Lines that
        contain no match contribute nothing. (Earlier revisions only printed
        the matches; callers that ignore the return value are unaffected.)
    """
    found = []
    for line in fh:
        line = line.strip()
        # findall returns every non-overlapping match on the line, so a line
        # holding both an address and a netmask yields two entries.
        match = IP_MATCHER.findall(line)
        if match:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("{} appened to buffer.".format(line))
            print(match)
            found.extend(match)
    return found
def write_to_file(buffer, out_file):
    """Append every entry of ``buffer`` to the file at ``out_file``.

    Args:
        buffer: sequence of strings to write. Entries are written verbatim;
            no newline or other separator is added.
        out_file: path of the file to append to; created if it does not exist.

    The file is opened once for the whole batch instead of once per entry.
    """
    if not buffer:
        # Nothing to write: do not touch (or create) the output file.
        return
    with open(out_file, "a") as res:
        for proxy in buffer:
            res.write(proxy)
if __name__ == '__main__':
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Running....")
    in_file = "in.txt"
    # Iterate over an open handle (not the path string); the with-block
    # closes the file once extraction is done.
    with open(in_file) as fh:
        extract_proxies(fh)
这将找到所有匹配项,如果您只想要第一个,则使用 IP_MATCHER.search
和 match.groups()
。这当然是假设您确实想要提取 IP 地址。
例如:
def extract_proxies(fh):
    """Print the address and mask of each line of ``fh`` holding exactly two matches.

    Args:
        fh: an iterable of text lines, typically an open file handle. It is
            only iterated; this function does not close it.

    Relies on the module-level ``IP_MATCHER`` compiled pattern. Lines with
    any other number of matches are skipped. Nothing is returned.
    """
    for line in fh:
        line = line.strip()
        match = IP_MATCHER.findall(line)
        # Exactly two matches are treated as an (address, mask) pair.
        if len(match) == 2:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("{} appened to buffer.".format(line))
            ip, mask = match
            print("IP: %s => Mask: %s" % (ip, mask))
请检查以下代码:
做了一些改变
- re.compile:正则表达式应先用 re.compile 编译,然后再通过编译得到的对象调用 match/search/findall 来使用。
- 原来的正则表达式不正确。match 从行首开始匹配,因此编写正则表达式时必须从行首写起;它不会直接匹配位于行中间的内容。
import re
def extract_proxies(in_file):
    """Return the first quoted dotted-quad argument of each matching line.

    Args:
        in_file: an open text file (or any iterable of lines with a
            ``close()`` method). It is always closed before returning,
            even if reading fails part-way.

    Returns:
        A list with one captured address string per matching line, in file
        order. The pattern checks digit counts only, not the 0-255 range.
    """
    buffer1 = []
    # Compiled once, outside the loop. Shape matched: optional leading
    # whitespace, name(name, "d.d.d.d" ; group 1 is the quoted address.
    m = re.compile(r'\s*\w+\(\w+,\s+\"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\"')
    try:
        for line in in_file:
            # match() anchors at the start of the line, which is why the
            # pattern spells out everything that precedes the address.
            r = m.match(line)
            if r is not None:
                # Single-argument print(...) behaves the same on Python 2 and 3.
                print("{} appened to buffer.".format(line))
                buffer1.append(r.group(1))
    finally:
        # Callers pass a freshly opened handle and never close it themselves.
        in_file.close()
    return buffer1
def write_to_file(buffer1, out_file):
    """Append every entry of ``buffer1`` to ``out_file``, one entry per line.

    Args:
        buffer1: sequence of strings to write; a newline is added after each.
        out_file: path of the file to append to; created if it does not exist.

    The file is opened once for the whole batch instead of once per entry.
    """
    if not buffer1:
        # Nothing to write: do not touch (or create) the output file.
        return
    with open(out_file, "a") as res:
        for proxy in buffer1:
            res.write(proxy + '\n')
if __name__ == '__main__':
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Running....")
    in_file = "sample.txt"
    out_file = "results.txt"
    # extract_proxies closes the handle it is given, so it is not closed here.
    write_to_file(extract_proxies(open(in_file)), out_file)
输出:
C:\Users\dinesh_pundkar\Desktop>python c.py
Running....
isInNet(hostDNS, "147.163.1.0", "255.255.0.0") ||
appened to buffer.
isInNet(hostDNS, "123.264.0.0", "255.255.0.0") ||
appened to buffer.
isInNet(hostDNS, "137.5.0.0", "255.0.0.0") ||
appened to buffer.
isInNet(hostDNS, "100.01.02.0", "255.0.0.0") ||
appened to buffer.
isInNet(hostDNS, "172.146.30.0", "255.240.0.0") ||
appened to buffer.
isInNet(hostDNS, "112.268.0.0", "255.255.0.0") || appened to buffer.
C:\Users\dinesh_pundkar\Desktop>python c.py