Simple page spider in Python - SQLite won't update
I have a very simple page spider that scrapes the words on a given page and stores the word counts in an SQLite database. Although the script finishes with exit code 0, the database never gets any entries.
I can't tell whether I'm snow-blind or whether something is genuinely wrong with my code.
Here is the project structure and the code:
- spider.py
- input.txt
- words.db
- utilities (folder):
- url_utilities.py
- database_utilities.py
spider.py
import argparse

from utilities import url_utilities, database_utilities


def main(database: str, url_list_file: str):
    big_word_list = []
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print(f"Reading {url}")
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)

    # database code
    path = "C:\Users\baduker\PycharmProjects\page_spider\words.db"
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path, words_list=big_word_list)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-db", "--database", help="SQLite File Name")
    parser.add_argument("-i", "--input", help="File with urls")
    args = parser.parse_args()
    database_file = args.database
    input_file = args.input
    main(database=database_file, url_list_file=input_file)
url_utilities.py
import re
import string
from urllib.request import urlopen

from bs4 import BeautifulSoup


def load_urls_from_file(file_path: str):
    try:
        with open("input.txt") as f:
            content = f.readlines()
            return content
    except FileNotFoundError:
        print(f"The file {file_path} could not be found.")
        exit(2)


def load_page(url: str):
    response = urlopen(url)
    html = response.read().decode("utf-8")
    return html


def scrape_page(page_contents: str):
    chicken_noodle = BeautifulSoup(page_contents, "html.parser")
    for script in chicken_noodle(["script", "style"]):
        script.extract()
    text = chicken_noodle.get_text()
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    text = ' '.join(chunk for chunk in chunks if chunk)
    plain_text = ''.join(filter(lambda x: x in string.printable, text))
    clean_words = []
    words = plain_text.split(" ")
    for word in words:
        clean = True
        for punctuation_marks in string.punctuation:
            if punctuation_marks in word:
                clean = False
        if any(char.isdigit() for char in word):
            clean = False
        # at least two characters but no more than 10
        if len(word) < 2 or len(word) > 10:
            clean = False
        if not re.match(r'^\w+$', word):
            clean = False
        if clean:
            try:
                clean_words.append(word.lower())
            except UnicodeEncodeError:
                print(".")
    return clean_words
database_utilities.py
import sqlite3 as lite


def create_database(database_path: str):
    conn = lite.connect(database_path)
    with conn:
        cur = conn.cursor()
        cur.execute("drop table if exists words")
        ddl = "create table words (word text not null primary key, usage_count int default 1 not null);"
        cur.execute(ddl)
        ddl = "create unique index words_word_uindex on words (word);"
        cur.execute(ddl)
    conn.close()


def save_words_to_database(database_path: str, words_list: list):
    conn = lite.connect(database_path)
    with conn:
        cur = conn.cursor()
        for word in words_list:
            sql = "select count(word) from words where word='" + word + "';"
            cur.execute(sql)
            count = cur.fetchone()[0]
            if count > 0:
                sql = "update words set usage_count = usage_count + 1 where word='" + word + "';"
            else:
                sql = "insert into words(word) values ('" + word + "');"
            cur.execute(sql)
    conn.commit()
    conn.close()
    print(f"Database save complete!")
input.txt
https://en.wikipedia.org/wiki/Python_(programming_language)
https://en.wikipedia.org/wiki/Guido_van_Rossum
https://en.wikipedia.org/wiki/Benevolent_dictator_for_life
Your code seems to work.
I suspect a permission issue with your database file.
Make sure this line points to a folder you have permission to write to:
path = "C:\Users\baduker\PycharmProjects\page_spider\words.db"
Or just drop the path and see if that works.
path = "words.db"
Your context manager, i.e.
with conn:
wants you to commit before you close the connection. In other words, you should let the block itself do the committing.
You should apply that in your database utilities file.
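Here is a minimal sketch of that idea, reusing your function and table names (the parameterized queries are a swap-in for your string concatenation; the behavior is otherwise the same). In sqlite3, a with conn: block commits automatically when it exits without an exception and rolls back otherwise, so no explicit conn.commit() is needed afterwards:

import sqlite3 as lite


def save_words_to_database(database_path: str, words_list: list):
    conn = lite.connect(database_path)
    # The with-block opens a transaction and commits it automatically
    # when the block exits cleanly (or rolls back on an exception),
    # so there is no explicit conn.commit() anywhere.
    with conn:
        cur = conn.cursor()
        for word in words_list:
            cur.execute("select count(word) from words where word = ?", (word,))
            count = cur.fetchone()[0]
            if count > 0:
                cur.execute("update words set usage_count = usage_count + 1 where word = ?", (word,))
            else:
                cur.execute("insert into words(word) values (?)", (word,))
    conn.close()  # closing does not commit; the block already did
    print("Database save complete!")

Note that the context manager only manages the transaction, not the connection itself, so conn.close() is still required.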