从文件中提取 IP 地址

Question

我正在尝试用 Python 从一个 asp 文件中提取 IP 地址，该文件如下所示：

onInternalNet = (
        isInNet(hostDNS, "147.163.1.0", "255.255.0.0") ||
        isInNet(hostDNS, "123.264.0.0", "255.255.0.0") ||
        isInNet(hostDNS, "137.5.0.0", "255.0.0.0") ||
        isInNet(hostDNS, "100.01.02.0", "255.0.0.0") ||
        isInNet(hostDNS, "172.146.30.0", "255.240.0.0") ||
        isInNet(hostDNS, "112.268.0.0", "255.255.0.0") ||

我是如何尝试使用正则表达式提取它们的：

if re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", line):

但是我收到一个错误：

Traceback (most recent call last):
  File "pull_proxy.py", line 27, in <module>
    write_to_file(extract_proxies(in_file), out_file)
  File "pull_proxy.py", line 8, in extract_proxies
    if re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", line):
  File "C:\Python27\lib\re.py", line 194, in compile
    return _compile(pattern, flags)
  File "C:\Python27\lib\re.py", line 233, in _compile
    bypass_cache = flags & DEBUG
TypeError: unsupported operand type(s) for &: 'str' and 'int'

我不明白为什么会出现该错误，我该如何处理这段代码才能使其按照我的意愿提取信息？

import re

def extract_proxies(in_file):
    # Items of `in_file` for which the check below is truthy, in iteration
    # order; this list is the function's return value.
    buffer = []

    for line in in_file:
        # NOTE(review): `line` is passed as the second positional argument
        # of re.compile(), which is `flags` rather than the text to test;
        # this is the call the TypeError in the traceback is raised from.
        if re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", line):
            print "{} appened to buffer.".format(line)
            buffer.append(line)
        else:
            pass

    return buffer

def write_to_file(buffer, out_file):
    """Append every entry of *buffer* to the file at *out_file*.

    Entries are written verbatim and in order; no separator is added, so an
    entry must already end with a newline if one is wanted.

    Args:
        buffer: Iterable of strings to append.
        out_file: Path of the file to append to (opened in ``"a+"`` mode).
    """
    entries = list(buffer)
    if not entries:
        # Nothing to write: leave the file untouched (and uncreated).
        return

    # Open the file once for the whole batch instead of once per entry.
    with open(out_file, "a+") as res:
        res.writelines(entries)

if __name__ == '__main__':
    print "Running...."
    # Path of the .asp file to scan and of the file the results are
    # appended to.
    in_file = "C:/Users/thomas_j_perkins/Downloads/test.asp"
    out_file = "c:/users/thomas_j_perkins/Downloads/results.txt"
    # NOTE(review): extract_proxies() receives the path string itself, not
    # an opened file, so its loop iterates over the characters of the path.
    write_to_file(extract_proxies(in_file), out_file)

编辑

意识到我没有打开文件：

import re

def extract_proxies(in_file):
    # Lines of `in_file` for which the check below is truthy, in reading
    # order; this list is the function's return value.
    buffer = []

    for line in in_file:
        # NOTE(review): `line` is passed as the second positional argument
        # of re.compile(), which is `flags` rather than the text to test;
        # this is the call the TypeError in the traceback is raised from.
        if re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", line):
            print "{} appened to buffer.".format(line)
            buffer.append(line)
        else:
            pass

    # Close the handle that was passed in by the caller.
    in_file.close()
    return buffer

def write_to_file(buffer, out_file):
    """Append every entry of *buffer* to the file at *out_file*.

    Entries are written verbatim and in order; no separator is added, so an
    entry must already end with a newline if one is wanted.

    Args:
        buffer: Iterable of strings to append.
        out_file: Path of the file to append to (opened in ``"a+"`` mode).
    """
    entries = list(buffer)
    if not entries:
        # Nothing to write: leave the file untouched (and uncreated).
        return

    # Open the file once for the whole batch instead of once per entry.
    with open(out_file, "a+") as res:
        res.writelines(entries)

if __name__ == '__main__':
    print "Running...."
    # Path of the .asp file to scan and of the file the results are
    # appended to.
    in_file = "C:/Users/thomas_j_perkins/Downloads/PAC-Global-Vista.asp"
    out_file = "c:/users/thomas_j_perkins/Downloads/results.txt"
    # The input is opened here and handed over as a file object;
    # extract_proxies() closes it before returning.
    write_to_file(extract_proxies(open(in_file, "r+")), out_file)

仍然出现同样的错误：

Running....
Traceback (most recent call last):
  File "pull_proxy.py", line 28, in <module>
    write_to_file(extract_proxies(open(in_file)), out_file)
  File "pull_proxy.py", line 8, in extract_proxies
    if re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", line):
  File "C:\Python27\lib\re.py", line 194, in compile
    return _compile(pattern, flags)
  File "C:\Python27\lib\re.py", line 233, in _compile
    bypass_cache = flags & DEBUG
TypeError: unsupported operand type(s) for &: 'str' and 'int'

Answer 1

re.compile 需要一个合适的 flags 参数（整数），而 line（字符串）不是。

你应该使用 re.match 而不是 re.compile：

re.compile

Compile a regular expression pattern into a regular expression object, which can be used for matching using its match() and search() methods...

Answer 2

你的初始错误

TypeError: unsupported operand type(s) for &: 'str' and 'int'

正是@Moses 在他的回答中所说的造成的。标志应该是 int 值，而不是字符串。

你应该编译你的正则表达式一次。此外，在遍历行时需要使用打开的文件句柄。

import re

IP_MATCHER = re.compile(r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})")

def extract_proxies(fh):
    for line in fh:
        line = line.strip()
        match = IP_MATCHER.findall(line)
        if match:
            print "{} appened to buffer.".format(line)
            print match
        else:
            pass



def write_to_file(buffer, out_file):
    """Append every entry of *buffer* to the file at *out_file*.

    Entries are written verbatim and in order; no separator is added, so an
    entry must already end with a newline if one is wanted.

    Args:
        buffer: Iterable of strings to append.
        out_file: Path of the file to append to (opened in ``"a+"`` mode).
    """
    entries = list(buffer)
    if not entries:
        # Nothing to write: leave the file untouched (and uncreated).
        return

    # Open the file once for the whole batch instead of once per entry.
    with open(out_file, "a+") as res:
        res.writelines(entries)


if __name__ == '__main__':
    print "Running...."
    in_file = "in.txt"
    # Hand extract_proxies() an open file handle so it iterates over lines;
    # the with-block closes the handle afterwards.
    with open(in_file) as fh:
        extract_proxies(fh)

这将找到所有匹配项，如果您只想要第一个，则使用 IP_MATCHER.search 和 match.groups()。这当然是假设您确实想要提取 IP 地址。

例如：

def extract_proxies(fh):
    for line in fh:
        line = line.strip()
        match = IP_MATCHER.findall(line)
        if len(match) == 2:
            print "{} appened to buffer.".format(line)
            ip, mask = match
            print "IP: %s => Mask: %s" % (ip, mask)
        else:
            pass

Answer 3

请检查以下代码：

做了一些改变

re.compile —— 正则表达式应先用 re.compile 编译，然后再对得到的模式对象调用 match/search/findall。
原来的正则表达式不正确：编写正则表达式时要从行首开始考虑整行内容，而原表达式用 ^ 和 $ 要求整行只有一个 IP 地址，无法匹配位于行中间的 IP 地址。

 import re


    def extract_proxies(in_file):
        """Return the first quoted IP address of every matching line.

        A line matches when, after optional leading whitespace, it has the
        shape ``word(word, "a.b.c.d"``; the dotted quad inside the quotes is
        what gets collected. Each matching line is also echoed to stdout
        exactly as read, including its line ending.

        Args:
            in_file: Open, iterable file-like object. It is always closed
                before this function returns, even if reading fails.

        Returns:
            List of the captured address strings, in file order.
        """
        #Regex compiled here
        m = re.compile(r'\s*\w+\(\w+,\s+\"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\"')
        buffer1 = []

        try:
            for line in in_file:
                #Used here to match
                r = m.match(line)
                if r is not None:
                    # Parenthesised so it prints identically on Python 2.7
                    # and also parses on Python 3.
                    print("{} appened to buffer.".format(line))
                    buffer1.append(r.group(1))
        finally:
            # Close the handle even when iteration or matching raises.
            in_file.close()

        return buffer1


    def write_to_file(buffer1, out_file):
        """Append every entry of *buffer1* to *out_file*, one per line.

        Args:
            buffer1: Iterable of strings; a newline is added after each one.
            out_file: Path of the file to append to (opened in ``"a+"`` mode).
        """
        lines = [proxy + '\n' for proxy in buffer1]
        if not lines:
            # Nothing to write: leave the file untouched (and uncreated).
            return

        # Open the file once for the whole batch instead of once per entry.
        with open(out_file, "a+") as res:
            res.writelines(lines)


    if __name__ == '__main__':
        print "Running...."
        in_file = "sample.txt"
        out_file = "results.txt"
        # The handle opened here is closed inside extract_proxies(); the
        # addresses it returns are then appended to out_file.
        write_to_file(extract_proxies(open(in_file)), out_file)

输出：

C:\Users\dinesh_pundkar\Desktop>python c.py
Running....
        isInNet(hostDNS, "147.163.1.0", "255.255.0.0") ||
 appened to buffer.
        isInNet(hostDNS, "123.264.0.0", "255.255.0.0") ||
 appened to buffer.
        isInNet(hostDNS, "137.5.0.0", "255.0.0.0") ||
 appened to buffer.
        isInNet(hostDNS, "100.01.02.0", "255.0.0.0") ||
 appened to buffer.
        isInNet(hostDNS, "172.146.30.0", "255.240.0.0") ||
 appened to buffer.
        isInNet(hostDNS, "112.268.0.0", "255.255.0.0") || appened to buffer.

C:\Users\dinesh_pundkar\Desktop>python c.py

从文件中提取 IP 地址

Extracting IP addresses from a file

python

extract

ipv4

python-2.7