优化此 .NET 爬虫算法的技巧
Tips to optimize this .NET crawler algorithm
我正在编写类似网络爬虫的东西,它的引擎遵循以下步骤:
- 读取作为参数传入的 RSS 链接
- 解析出 RSS 条目列表
- 对每个条目的链接,用一条单独的查询检查它是否已存在于数据库(SQL Server)中
- 如果链接是新的,再用单独的查询把各字段插入数据库
''' <summary>
''' Parses the RSS feed at RssURL and, for every item whose link is not yet in
''' the posts table, inserts the link into posts and the item into queue.
''' </summary>
''' <remarks>
''' All values are sent as SQL parameters instead of being concatenated into the
''' statement text, so quotes in feed data can neither break nor inject SQL.
''' The existence check uses "=" rather than LIKE because "_", "%" and "[" are
''' legal URL characters that LIKE would treat as wildcards.
''' The whole batch runs in one transaction: a posts row is never stored without
''' its queue row, and the server commits once instead of once per statement.
''' The connection is returned to the pool by Using; the pool itself is no
''' longer cleared on every tick.
''' </remarks>
Public Sub MyTickHandler()
    Dim NewItems As List(Of Structures.RSSItem) = RssReader.ParseRssFile(RssURL)
    If NewItems Is Nothing OrElse NewItems.Count = 0 Then Return

    ' NOTE(review): this value is read but the literal connection string below is
    ' what is actually used - confirm which one is intended.
    Dim connString = Configs.NewsDBConnection

    Using myConnection As New SqlConnection("Server=localhost;Database=db;Integrated Security=SSPI;;Connection Timeout=45;Max Pool Size= 300")
        myConnection.Open()

        ' Disposing an uncommitted transaction rolls it back, so an exception
        ' anywhere in the loop leaves posts and queue untouched.
        Using batch As SqlTransaction = myConnection.BeginTransaction()
            Using findPost As New SqlCommand("SELECT id FROM posts with (nolock) WHERE link = @link", myConnection, batch),
                  addPost As New SqlCommand("INSERT INTO posts (link) VALUES (@link)", myConnection, batch),
                  addQueue As New SqlCommand("INSERT INTO queue (link,descrip,site,title,category) VALUES (@link,@descrip,@site,@title,@category)", myConnection, batch)

                ' NOTE(review): NVarChar is assumed; if the columns are varchar,
                ' switch the types so an index on posts.link can be used directly.
                findPost.Parameters.Add("@link", System.Data.SqlDbType.NVarChar)
                addPost.Parameters.Add("@link", System.Data.SqlDbType.NVarChar)
                addQueue.Parameters.Add("@link", System.Data.SqlDbType.NVarChar)
                addQueue.Parameters.Add("@descrip", System.Data.SqlDbType.NVarChar)
                addQueue.Parameters.Add("@title", System.Data.SqlDbType.NVarChar)
                ' "" & x keeps the coercion the original concatenation performed
                ' (Nothing becomes an empty string, non-strings become text).
                addQueue.Parameters.Add("@site", System.Data.SqlDbType.NVarChar).Value = "" & RssSite
                addQueue.Parameters.Add("@category", System.Data.SqlDbType.NVarChar).Value = "" & RssCategory

                For Each item In NewItems
                    ' Items without a link cannot be de-duplicated; skip them.
                    Dim link As String = If(item.link, String.Empty).Trim().ToLower()
                    If link.Length = 0 Then Continue For

                    findPost.Parameters("@link").Value = link
                    If findPost.ExecuteScalar() Is Nothing Then
                        addPost.Parameters("@link").Value = link
                        addPost.ExecuteNonQuery()

                        addQueue.Parameters("@link").Value = link
                        addQueue.Parameters("@descrip").Value = "" & StringToBase64(item.description)
                        addQueue.Parameters("@title").Value = "" & StringToBase64(item.title)
                        addQueue.ExecuteNonQuery()
                    End If
                Next
            End Using

            batch.Commit()
        End Using
    End Using
End Sub
单次调用时,这段代码运行良好。
但我大约有 150 个 RSS 链接,需要通过线程每 2 分钟检查一次;随着 SQL 查询负载的增加,该进程和 SQL Server 都会停止响应,应用程序随之崩溃!
我尝试过一些办法,例如增大 SQL Server 的响应超时时间,但完全没有帮助。
这个过程有什么更好的方法或技巧吗?
谢谢
- 在 for-each 循环之外只做一次查询,一次性取回所有已存在的链接(注意:ADO.NET 不能把一个列表直接绑定到单个 IN 参数上,需要为每个值生成一个参数,或改用表值参数):
SELECT id, link FROM posts with (nolock) WHERE link in (@listOfLowerCaseLinks)
Dim myListOfLinks As New List(Of String)
...
TheCommand.Parameters.AddWithValue("@listOfLowerCaseLinks", myListOfLinks)
- 将整个插入操作(整个 for-each 循环)包装到一个 SQL 事务中。这样,数据库就不必在每条语句之后都提交一次。
我建议您为此任务向存储过程传递一个表值参数(table-valued parameter)。这样只需一次调用即可插入整个列表。下面是一个示例,您可以按实际情况调整各列的长度。在 posts 表的 link 列上建立索引很重要。本例假设 link 是唯一的。
用于创建表类型和存储过程的 T-SQL:
-- Table type used to hand a whole batch of RSS items to a stored procedure
-- as a single table-valued parameter.
-- link is the primary key, so one batch cannot contain the same link twice.
CREATE TYPE dbo.linkInfo AS TABLE
(
    link    varchar(255) NOT NULL PRIMARY KEY,
    descrip varchar(255),
    title   varchar(255)
);
GO
-- ALTER PROC fails when the procedure does not exist yet, so create an empty
-- stub first; this makes the script re-runnable on a fresh database.
IF OBJECT_ID(N'dbo.usp_InsertRssItems', N'P') IS NULL
    EXEC(N'CREATE PROC dbo.usp_InsertRssItems AS RETURN 0;');
GO
-- Inserts every link of @linkInfo that is not yet in dbo.posts, then queues
-- exactly those newly inserted items in dbo.queue.
--   @site     : value stored in queue.site for every queued item
--   @category : value stored in queue.category for every queued item
--   @linkInfo : batch of items (link, descrip, title)
ALTER PROC dbo.usp_InsertRssItems
     @site varchar(255)
    ,@category varchar(255)
    ,@linkInfo dbo.linkInfo READONLY
AS
SET NOCOUNT ON;
-- Any run-time error aborts and rolls back the whole batch, so a link is never
-- left in posts without its queue row.
SET XACT_ABORT ON;

DECLARE @InsertedPosts TABLE(link varchar(255));

BEGIN TRANSACTION;

-- UPDLOCK + HOLDLOCK keep the "does it exist?" check and the insert atomic
-- with respect to other sessions calling this procedure at the same time.
INSERT INTO dbo.posts(link)
OUTPUT inserted.link INTO @InsertedPosts
SELECT li.link
FROM @linkInfo AS li
WHERE NOT EXISTS(
    SELECT *
    FROM dbo.posts AS p WITH (UPDLOCK, HOLDLOCK)
    WHERE p.link = li.link
    );

-- Queue only the items whose link was inserted by the statement above.
INSERT INTO dbo.queue(link,descrip,site,title,category)
SELECT li.link, li.descrip, @site, li.title, @category
FROM @linkInfo AS li
WHERE EXISTS(
    SELECT *
    FROM @InsertedPosts AS ip
    WHERE ip.link = li.link
    );

COMMIT TRANSACTION;
GO
示例 VB.NET 代码:
''' <summary>
''' Parses the RSS feed at RssURL and sends all items to the stored procedure
''' dbo.usp_InsertRssItems in one call, as a table-valued parameter.
''' </summary>
''' <remarks>
''' The connection and command are wrapped in Using so they are released even
''' when opening the connection or executing the procedure throws.
''' No database round trip is made when the feed yields no items.
''' </remarks>
Sub MyTickHandler()
    Dim NewItems As List(Of Structures.RssItem) = RssReader.ParseRssFile(RssURL)
    If NewItems Is Nothing OrElse NewItems.Count = 0 Then Return

    Dim dt = getNewRssItemDataTable(NewItems)

    ' NOTE(review): this value is read but the literal connection string below is
    ' what is actually used - confirm which one is intended.
    Dim connString = Configs.NewsDBConnection

    Using myConnection As New SqlConnection("Server=localhost;Database=db;Integrated Security=SSPI;;Connection Timeout=45;Max Pool Size= 300"),
          TheCommand As New SqlCommand("dbo.usp_InsertRssItems", myConnection)

        TheCommand.CommandType = CommandType.StoredProcedure

        ' NOTE(review): "z" looks like a placeholder - pass the feed's real site
        ' and category values here.
        TheCommand.Parameters.Add(New SqlParameter("@site", SqlDbType.VarChar, 255)).Value = "z"
        TheCommand.Parameters.Add(New SqlParameter("@category", SqlDbType.VarChar, 255)).Value = "z"

        ' The DataTable's columns must line up with the dbo.linkInfo table type.
        Dim linkInfoParameter = TheCommand.Parameters.Add(New SqlParameter("@linkInfo", SqlDbType.Structured))
        linkInfoParameter.TypeName = "dbo.linkInfo"
        linkInfoParameter.Value = dt

        myConnection.Open()
        TheCommand.ExecuteNonQuery()
    End Using
End Sub
''' <summary>
''' Builds a DataTable with the columns link, descrip and title (in that
''' order) holding one row per distinct, non-blank link in the given list.
''' </summary>
''' <param name="NewRssItems">Parsed feed items; Nothing is treated as empty.</param>
''' <returns>A DataTable with zero or more rows.</returns>
''' <remarks>
''' Items with a blank link and repeated links are skipped: the dbo.linkInfo
''' table type declares link as NOT NULL PRIMARY KEY, so a single such row
''' would make the whole batch fail. Links are compared ignoring case, and
''' surrounding white space is trimmed before the comparison.
''' NOTE(review): MaxLength is 255 for every column; confirm that feed
''' descriptions fit or widen the column and the table type together.
''' </remarks>
Private Function getNewRssItemDataTable(NewRssItems As List(Of Structures.RssItem)) As DataTable
    Dim dt As New DataTable
    dt.Columns.Add("link", GetType(String)).MaxLength = 255
    dt.Columns.Add("descrip", GetType(String)).MaxLength = 255
    dt.Columns.Add("title", GetType(String)).MaxLength = 255

    If NewRssItems Is Nothing Then Return dt

    Dim seenLinks As New HashSet(Of String)(StringComparer.OrdinalIgnoreCase)
    For Each NewRssItem In NewRssItems
        Dim link As String = If(NewRssItem.link, String.Empty).Trim()
        If link.Length = 0 Then Continue For
        ' HashSet.Add returns False when the link was already seen in this batch.
        If Not seenLinks.Add(link) Then Continue For

        ' Fill the row before attaching it so the table never holds a half-set row.
        Dim row = dt.NewRow
        row("link") = link
        row("descrip") = NewRssItem.description
        row("title") = NewRssItem.title
        dt.Rows.Add(row)
    Next NewRssItem

    Return dt
End Function
编辑:
我看到您提到想要一个 SqlBulkCopy 示例。如果插入是无条件的(即无需先检查链接是否已存在),可以使用下面的方法:
''' <summary>
''' Unconditionally bulk-inserts all given RSS items into dbo.queue.
''' </summary>
''' <param name="connectionString">Connection string of the target database.</param>
''' <param name="site">Value written to the site column of every row.</param>
''' <param name="category">Value written to the category column of every row.</param>
''' <param name="NewRssItems">Items to insert; Nothing or an empty list inserts nothing.</param>
''' <remarks>
''' Values are assigned by column name. The previous version assigned them by
''' index in an order that did not match the declared columns, so site ended up
''' in "link", category in "descrip", and so on.
''' Explicit column mappings are added because SqlBulkCopy otherwise maps source
''' to destination columns by ordinal position, which only works when dbo.queue
''' has exactly these columns in exactly this order.
''' </remarks>
Sub executeBulkInsert(connectionString As String, site As String, category As String, NewRssItems As List(Of Structures.RssItem))
    If NewRssItems Is Nothing OrElse NewRssItems.Count = 0 Then Return

    Dim dt As New DataTable
    dt.Columns.Add("link", GetType(String)).MaxLength = 255
    dt.Columns.Add("descrip", GetType(String)).MaxLength = 255
    dt.Columns.Add("site", GetType(String)).MaxLength = 255
    dt.Columns.Add("title", GetType(String)).MaxLength = 255
    dt.Columns.Add("category", GetType(String)).MaxLength = 255

    For Each NewRssItem In NewRssItems
        Dim row = dt.NewRow
        row("link") = NewRssItem.link
        row("descrip") = NewRssItem.description
        row("site") = site
        row("title") = NewRssItem.title
        row("category") = category
        dt.Rows.Add(row)
    Next NewRssItem

    Using bcp As New SqlBulkCopy(connectionString)
        bcp.DestinationTableName = "dbo.queue"
        ' Map every source column to the destination column of the same name.
        For Each column As DataColumn In dt.Columns
            bcp.ColumnMappings.Add(column.ColumnName, column.ColumnName)
        Next
        bcp.WriteToServer(dt)
    End Using
End Sub
我正在编写类似网络爬虫的东西,它的引擎遵循以下步骤:
- 读取作为参数传入的 RSS 链接
- 解析出 RSS 条目列表
- 对每个条目的链接,用一条单独的查询检查它是否已存在于数据库(SQL Server)中
- 如果链接是新的,再用单独的查询把各字段插入数据库
''' <summary>
''' Parses the RSS feed at RssURL and, for every item whose link is not yet in
''' the posts table, inserts the link into posts and the item into queue.
''' </summary>
''' <remarks>
''' All values are sent as SQL parameters instead of being concatenated into the
''' statement text, so quotes in feed data can neither break nor inject SQL.
''' The existence check uses "=" rather than LIKE because "_", "%" and "[" are
''' legal URL characters that LIKE would treat as wildcards.
''' The whole batch runs in one transaction: a posts row is never stored without
''' its queue row, and the server commits once instead of once per statement.
''' The connection is returned to the pool by Using; the pool itself is no
''' longer cleared on every tick.
''' </remarks>
Public Sub MyTickHandler()
    Dim NewItems As List(Of Structures.RSSItem) = RssReader.ParseRssFile(RssURL)
    If NewItems Is Nothing OrElse NewItems.Count = 0 Then Return

    ' NOTE(review): this value is read but the literal connection string below is
    ' what is actually used - confirm which one is intended.
    Dim connString = Configs.NewsDBConnection

    Using myConnection As New SqlConnection("Server=localhost;Database=db;Integrated Security=SSPI;;Connection Timeout=45;Max Pool Size= 300")
        myConnection.Open()

        ' Disposing an uncommitted transaction rolls it back, so an exception
        ' anywhere in the loop leaves posts and queue untouched.
        Using batch As SqlTransaction = myConnection.BeginTransaction()
            Using findPost As New SqlCommand("SELECT id FROM posts with (nolock) WHERE link = @link", myConnection, batch),
                  addPost As New SqlCommand("INSERT INTO posts (link) VALUES (@link)", myConnection, batch),
                  addQueue As New SqlCommand("INSERT INTO queue (link,descrip,site,title,category) VALUES (@link,@descrip,@site,@title,@category)", myConnection, batch)

                ' NOTE(review): NVarChar is assumed; if the columns are varchar,
                ' switch the types so an index on posts.link can be used directly.
                findPost.Parameters.Add("@link", System.Data.SqlDbType.NVarChar)
                addPost.Parameters.Add("@link", System.Data.SqlDbType.NVarChar)
                addQueue.Parameters.Add("@link", System.Data.SqlDbType.NVarChar)
                addQueue.Parameters.Add("@descrip", System.Data.SqlDbType.NVarChar)
                addQueue.Parameters.Add("@title", System.Data.SqlDbType.NVarChar)
                ' "" & x keeps the coercion the original concatenation performed
                ' (Nothing becomes an empty string, non-strings become text).
                addQueue.Parameters.Add("@site", System.Data.SqlDbType.NVarChar).Value = "" & RssSite
                addQueue.Parameters.Add("@category", System.Data.SqlDbType.NVarChar).Value = "" & RssCategory

                For Each item In NewItems
                    ' Items without a link cannot be de-duplicated; skip them.
                    Dim link As String = If(item.link, String.Empty).Trim().ToLower()
                    If link.Length = 0 Then Continue For

                    findPost.Parameters("@link").Value = link
                    If findPost.ExecuteScalar() Is Nothing Then
                        addPost.Parameters("@link").Value = link
                        addPost.ExecuteNonQuery()

                        addQueue.Parameters("@link").Value = link
                        addQueue.Parameters("@descrip").Value = "" & StringToBase64(item.description)
                        addQueue.Parameters("@title").Value = "" & StringToBase64(item.title)
                        addQueue.ExecuteNonQuery()
                    End If
                Next
            End Using

            batch.Commit()
        End Using
    End Using
End Sub
单次调用时,这段代码运行良好。
但我大约有 150 个 RSS 链接,需要通过线程每 2 分钟检查一次;随着 SQL 查询负载的增加,该进程和 SQL Server 都会停止响应,应用程序随之崩溃!
我尝试过一些办法,例如增大 SQL Server 的响应超时时间,但完全没有帮助。
这个过程有什么更好的方法或技巧吗?
谢谢
- 在 for-each 循环之外只做一次查询,一次性取回所有已存在的链接(注意:ADO.NET 不能把一个列表直接绑定到单个 IN 参数上,需要为每个值生成一个参数,或改用表值参数):
SELECT id, link FROM posts with (nolock) WHERE link in (@listOfLowerCaseLinks)
Dim myListOfLinks As New List(Of String)
...
TheCommand.Parameters.AddWithValue("@listOfLowerCaseLinks", myListOfLinks)
- 将整个插入操作(整个 for-each 循环)包装到一个 SQL 事务中。这样,数据库就不必在每条语句之后都提交一次。
我建议您为此任务向存储过程传递一个表值参数(table-valued parameter)。这样只需一次调用即可插入整个列表。下面是一个示例,您可以按实际情况调整各列的长度。在 posts 表的 link 列上建立索引很重要。本例假设 link 是唯一的。
用于创建表类型和存储过程的 T-SQL:
-- Table type used to hand a whole batch of RSS items to a stored procedure
-- as a single table-valued parameter.
-- link is the primary key, so one batch cannot contain the same link twice.
CREATE TYPE dbo.linkInfo AS TABLE
(
    link    varchar(255) NOT NULL PRIMARY KEY,
    descrip varchar(255),
    title   varchar(255)
);
GO
-- ALTER PROC fails when the procedure does not exist yet, so create an empty
-- stub first; this makes the script re-runnable on a fresh database.
IF OBJECT_ID(N'dbo.usp_InsertRssItems', N'P') IS NULL
    EXEC(N'CREATE PROC dbo.usp_InsertRssItems AS RETURN 0;');
GO
-- Inserts every link of @linkInfo that is not yet in dbo.posts, then queues
-- exactly those newly inserted items in dbo.queue.
--   @site     : value stored in queue.site for every queued item
--   @category : value stored in queue.category for every queued item
--   @linkInfo : batch of items (link, descrip, title)
ALTER PROC dbo.usp_InsertRssItems
     @site varchar(255)
    ,@category varchar(255)
    ,@linkInfo dbo.linkInfo READONLY
AS
SET NOCOUNT ON;
-- Any run-time error aborts and rolls back the whole batch, so a link is never
-- left in posts without its queue row.
SET XACT_ABORT ON;

DECLARE @InsertedPosts TABLE(link varchar(255));

BEGIN TRANSACTION;

-- UPDLOCK + HOLDLOCK keep the "does it exist?" check and the insert atomic
-- with respect to other sessions calling this procedure at the same time.
INSERT INTO dbo.posts(link)
OUTPUT inserted.link INTO @InsertedPosts
SELECT li.link
FROM @linkInfo AS li
WHERE NOT EXISTS(
    SELECT *
    FROM dbo.posts AS p WITH (UPDLOCK, HOLDLOCK)
    WHERE p.link = li.link
    );

-- Queue only the items whose link was inserted by the statement above.
INSERT INTO dbo.queue(link,descrip,site,title,category)
SELECT li.link, li.descrip, @site, li.title, @category
FROM @linkInfo AS li
WHERE EXISTS(
    SELECT *
    FROM @InsertedPosts AS ip
    WHERE ip.link = li.link
    );

COMMIT TRANSACTION;
GO
示例 VB.NET 代码:
''' <summary>
''' Parses the RSS feed at RssURL and sends all items to the stored procedure
''' dbo.usp_InsertRssItems in one call, as a table-valued parameter.
''' </summary>
''' <remarks>
''' The connection and command are wrapped in Using so they are released even
''' when opening the connection or executing the procedure throws.
''' No database round trip is made when the feed yields no items.
''' </remarks>
Sub MyTickHandler()
    Dim NewItems As List(Of Structures.RssItem) = RssReader.ParseRssFile(RssURL)
    If NewItems Is Nothing OrElse NewItems.Count = 0 Then Return

    Dim dt = getNewRssItemDataTable(NewItems)

    ' NOTE(review): this value is read but the literal connection string below is
    ' what is actually used - confirm which one is intended.
    Dim connString = Configs.NewsDBConnection

    Using myConnection As New SqlConnection("Server=localhost;Database=db;Integrated Security=SSPI;;Connection Timeout=45;Max Pool Size= 300"),
          TheCommand As New SqlCommand("dbo.usp_InsertRssItems", myConnection)

        TheCommand.CommandType = CommandType.StoredProcedure

        ' NOTE(review): "z" looks like a placeholder - pass the feed's real site
        ' and category values here.
        TheCommand.Parameters.Add(New SqlParameter("@site", SqlDbType.VarChar, 255)).Value = "z"
        TheCommand.Parameters.Add(New SqlParameter("@category", SqlDbType.VarChar, 255)).Value = "z"

        ' The DataTable's columns must line up with the dbo.linkInfo table type.
        Dim linkInfoParameter = TheCommand.Parameters.Add(New SqlParameter("@linkInfo", SqlDbType.Structured))
        linkInfoParameter.TypeName = "dbo.linkInfo"
        linkInfoParameter.Value = dt

        myConnection.Open()
        TheCommand.ExecuteNonQuery()
    End Using
End Sub
''' <summary>
''' Builds a DataTable with the columns link, descrip and title (in that
''' order) holding one row per distinct, non-blank link in the given list.
''' </summary>
''' <param name="NewRssItems">Parsed feed items; Nothing is treated as empty.</param>
''' <returns>A DataTable with zero or more rows.</returns>
''' <remarks>
''' Items with a blank link and repeated links are skipped: the dbo.linkInfo
''' table type declares link as NOT NULL PRIMARY KEY, so a single such row
''' would make the whole batch fail. Links are compared ignoring case, and
''' surrounding white space is trimmed before the comparison.
''' NOTE(review): MaxLength is 255 for every column; confirm that feed
''' descriptions fit or widen the column and the table type together.
''' </remarks>
Private Function getNewRssItemDataTable(NewRssItems As List(Of Structures.RssItem)) As DataTable
    Dim dt As New DataTable
    dt.Columns.Add("link", GetType(String)).MaxLength = 255
    dt.Columns.Add("descrip", GetType(String)).MaxLength = 255
    dt.Columns.Add("title", GetType(String)).MaxLength = 255

    If NewRssItems Is Nothing Then Return dt

    Dim seenLinks As New HashSet(Of String)(StringComparer.OrdinalIgnoreCase)
    For Each NewRssItem In NewRssItems
        Dim link As String = If(NewRssItem.link, String.Empty).Trim()
        If link.Length = 0 Then Continue For
        ' HashSet.Add returns False when the link was already seen in this batch.
        If Not seenLinks.Add(link) Then Continue For

        ' Fill the row before attaching it so the table never holds a half-set row.
        Dim row = dt.NewRow
        row("link") = link
        row("descrip") = NewRssItem.description
        row("title") = NewRssItem.title
        dt.Rows.Add(row)
    Next NewRssItem

    Return dt
End Function
编辑:
我看到您提到想要一个 SqlBulkCopy 示例。如果插入是无条件的(即无需先检查链接是否已存在),可以使用下面的方法:
''' <summary>
''' Unconditionally bulk-inserts all given RSS items into dbo.queue.
''' </summary>
''' <param name="connectionString">Connection string of the target database.</param>
''' <param name="site">Value written to the site column of every row.</param>
''' <param name="category">Value written to the category column of every row.</param>
''' <param name="NewRssItems">Items to insert; Nothing or an empty list inserts nothing.</param>
''' <remarks>
''' Values are assigned by column name. The previous version assigned them by
''' index in an order that did not match the declared columns, so site ended up
''' in "link", category in "descrip", and so on.
''' Explicit column mappings are added because SqlBulkCopy otherwise maps source
''' to destination columns by ordinal position, which only works when dbo.queue
''' has exactly these columns in exactly this order.
''' </remarks>
Sub executeBulkInsert(connectionString As String, site As String, category As String, NewRssItems As List(Of Structures.RssItem))
    If NewRssItems Is Nothing OrElse NewRssItems.Count = 0 Then Return

    Dim dt As New DataTable
    dt.Columns.Add("link", GetType(String)).MaxLength = 255
    dt.Columns.Add("descrip", GetType(String)).MaxLength = 255
    dt.Columns.Add("site", GetType(String)).MaxLength = 255
    dt.Columns.Add("title", GetType(String)).MaxLength = 255
    dt.Columns.Add("category", GetType(String)).MaxLength = 255

    For Each NewRssItem In NewRssItems
        Dim row = dt.NewRow
        row("link") = NewRssItem.link
        row("descrip") = NewRssItem.description
        row("site") = site
        row("title") = NewRssItem.title
        row("category") = category
        dt.Rows.Add(row)
    Next NewRssItem

    Using bcp As New SqlBulkCopy(connectionString)
        bcp.DestinationTableName = "dbo.queue"
        ' Map every source column to the destination column of the same name.
        For Each column As DataColumn In dt.Columns
            bcp.ColumnMappings.Add(column.ColumnName, column.ColumnName)
        Next
        bcp.WriteToServer(dt)
    End Using
End Sub