Simple page spider in Python - SQLite won't update
I have a very simple page spider that scrapes the words on a given page and stores the word counts in an SQLite database. Although the script finishes with exit code 0, the database never gets any entries.
I can't tell whether I'm snow-blind or whether something is genuinely wrong with my code.
Here is the project structure and the code:
- spider.py
- input.txt
- words.db
- utilities (folder):
- url_utilities.py
- database_utilities.py
spider.py
import argparse

from utilities import url_utilities, database_utilities


def main(database: str, url_list_file: str):
    big_word_list = []
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print(f"Reading {url}")
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)

    # database code
    path = "C:\Users\baduker\PycharmProjects\page_spider\words.db"
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path, words_list=big_word_list)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-db", "--database", help="SQLite File Name")
    parser.add_argument("-i", "--input", help="File with urls")
    args = parser.parse_args()
    database_file = args.database
    input_file = args.input
    main(database=database_file, url_list_file=input_file)
url_utilities.py
import re
import string
from urllib.request import urlopen

from bs4 import BeautifulSoup


def load_urls_from_file(file_path: str):
    try:
        with open("input.txt") as f:
            content = f.readlines()
            return content
    except FileNotFoundError:
        print(f"The file {file_path} could not be found.")
        exit(2)


def load_page(url: str):
    response = urlopen(url)
    html = response.read().decode("utf-8")
    return html


def scrape_page(page_contents: str):
    chicken_noodle = BeautifulSoup(page_contents, "html.parser")
    for script in chicken_noodle(["script", "style"]):
        script.extract()
    text = chicken_noodle.get_text()
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    text = ' '.join(chunk for chunk in chunks if chunk)
    plain_text = ''.join(filter(lambda x: x in string.printable, text))
    clean_words = []
    words = plain_text.split(" ")
    for word in words:
        clean = True
        for punctuation_marks in string.punctuation:
            if punctuation_marks in word:
                clean = False
        if any(char.isdigit() for char in word):
            clean = False
        # at least two characters but no more than 10
        if len(word) < 2 or len(word) > 10:
            clean = False
        if not re.match(r'^\w+$', word):
            clean = False
        if clean:
            try:
                clean_words.append(word.lower())
            except UnicodeEncodeError:
                print(".")
    return clean_words
database_utilities.py
import sqlite3 as lite


def create_database(database_path: str):
    conn = lite.connect(database_path)
    with conn:
        cur = conn.cursor()
        cur.execute("drop table if exists words")
        ddl = "create table words (word text not null primary key, usage_count int default 1 not null);"
        cur.execute(ddl)
        ddl = "create unique index words_word_uindex on words (word);"
        cur.execute(ddl)
    conn.close()


def save_words_to_database(database_path: str, words_list: list):
    conn = lite.connect(database_path)
    with conn:
        cur = conn.cursor()
        for word in words_list:
            sql = "select count(word) from words where word='" + word + "';"
            cur.execute(sql)
            count = cur.fetchone()[0]
            if count > 0:
                sql = "update words set usage_count = usage_count + 1 where word='" + word + "';"
            else:
                sql = "insert into words(word) values ('" + word + "');"
            cur.execute(sql)
    conn.commit()
    conn.close()
    print(f"Database save complete!")
input.txt
https://en.wikipedia.org/wiki/Python_(programming_language)
https://en.wikipedia.org/wiki/Guido_van_Rossum
https://en.wikipedia.org/wiki/Benevolent_dictator_for_life
Your code seems to work.
I suspect a permission issue with your database file.
Make sure this line points to a folder you have permission to write to:
path = "C:\Users\baduker\PycharmProjects\page_spider\words.db"
Or just drop the path and see if that works.
path = "words.db"
Your context manager, i.e.
with conn:
wants you to commit before you close the connection. In other words, you should let the block itself do the committing.
You should apply that in your database utilities file.
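Here is a minimal sketch of that idea, reusing your function and table names (the parameterized queries are a swap-in for your string concatenation; the behavior is otherwise the same). In sqlite3, a with conn: block commits automatically when it exits without an exception and rolls back otherwise, so no explicit conn.commit() is needed afterwards:

import sqlite3 as lite


def save_words_to_database(database_path: str, words_list: list):
    conn = lite.connect(database_path)
    # The with-block opens a transaction and commits it automatically
    # when the block exits cleanly (or rolls back on an exception),
    # so there is no explicit conn.commit() anywhere.
    with conn:
        cur = conn.cursor()
        for word in words_list:
            cur.execute("select count(word) from words where word = ?", (word,))
            count = cur.fetchone()[0]
            if count > 0:
                cur.execute("update words set usage_count = usage_count + 1 where word = ?", (word,))
            else:
                cur.execute("insert into words(word) values (?)", (word,))
    conn.close()  # closing does not commit; the block already did
    print("Database save complete!")

Note that the context manager only manages the transaction, not the connection itself, so conn.close() is still required.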