Please take a look at my problem; I believe it should be easy to solve.
I tried to implement async/await around the spawned child process, but it didn't work. Please see the following.
Expected output
*************
http://www.stevecostellolaw.com/
*************
http://www.stevecostellolaw.com/personal-injury.html
http://www.stevecostellolaw.com/personal-injury.html
*************
http://www.stevecostellolaw.com/#
http://www.stevecostellolaw.com/#
*************
http://www.stevecostellolaw.com/home.html
http://www.stevecostellolaw.com/home.html
*************
http://www.stevecostellolaw.com/about-us.html
http://www.stevecostellolaw.com/about-us.html
*************
http://www.stevecostellolaw.com/
http://www.stevecostellolaw.com/
*************
Because every time the spawned child reaches the await, it should return to the Python script, print *************, and then print the URL. Please ignore the fact that the same URL is printed twice here.
Output I am getting
C:\Users\ASUS\Desktop\searchermc>node app.js
server running on port 3000
DevTools listening on ws://127.0.0.1:52966/devtools/browser/933c20c7-e295-4d84-a4b8-eeb5888ecbbf
[3020:120:0402/105304.190:ERROR:device_event_log_impl.cc(214)] [10:53:04.188] USB: usb_device_handle_win.cc:1056 Failed to read descriptor from node connection: A device attached to the system is not functioning. (0x1F)
[3020:120:0402/105304.190:ERROR:device_event_log_impl.cc(214)] [10:53:04.189] USB: usb_device_handle_win.cc:1056 Failed to read descriptor from node connection: A device attached to the system is not functioning. (0x1F)
*************
http://www.stevecostellolaw.com/
http://www.stevecostellolaw.com/personal-injury.html
http://www.stevecostellolaw.com/personal-injury.html
http://www.stevecostellolaw.com/#
http://www.stevecostellolaw.com/#
http://www.stevecostellolaw.com/home.html
http://www.stevecostellolaw.com/home.html
http://www.stevecostellolaw.com/about-us.html
http://www.stevecostellolaw.com/about-us.html
http://www.stevecostellolaw.com/
http://www.stevecostellolaw.com/
*************
Please see the app.js code below:
// form submit request
app.post('/formsubmit', function(req, res){
    csvData = req.files.csvfile.data.toString('utf8');
    filteredArray = cleanArray(csvData.split(/\r?\n/))
    csvData = get_array_string(filteredArray)
    csvData = csvData.trim()
    var keywords = req.body.keywords
    keywords = keywords.trim()

    // Send request to python script
    var spawn = require('child_process').spawn;
    var process = spawn('python', ["./webextraction.py", csvData, keywords, req.body.full_search])
    var outarr = []

    // process.stdout.on('data', (data) => {
    //     console.log(`stdout: ${data}`);
    // });

    process.stdout.on('data', async function(data){
        console.log("\n ************* ")
        console.log(data.toString().trim())
        await outarr.push(data.toString().trim())
        console.log("\n ************* ")
    });
});
Python function that prints the URL when the if condition matches:
# Function for searching keyword start
def search_keyword(href, search_key):
    extension_list = ['mp3', 'jpg', 'exe', 'jpeg', 'png', 'pdf', 'vcf']
    if(href.split('.')[-1] not in extension_list):
        try:
            content = selenium_calling(href)
            soup = BeautifulSoup(content,'html.parser')
            search_string = re.sub("\s+"," ", soup.body.text)
            search_string = search_string.lower()
            res = [ele for ele in search_key if(ele.lower() in search_string)]
            outstr = getstring(res)
            outstr = outstr.lstrip(", ")
            if(len(res) > 0):
                print(href)
                found_results.append(href)
                href_key_dict[href] = outstr
                return 1
            else:
                notfound_results.append(href)
        except Exception as err:
            pass
I want to do all this because the Python script takes quite a long time to execute, so the request times out every time. That's why I want to get the intermediate output of the Python script in my Node.js script. You can see the error I'm getting in the image below.
I'm not sure I fully understand what you're trying to do, but I'll give it a shot, since you seem to have asked this question several times already (which is generally not a good idea). I don't think your question is clear enough - it would help a lot if you could spell out what your end goal is (i.e., how do you want it to behave?).
I think you're describing two different problems here. The first is that you want a new line of "******" before each separate piece of data returned from the script. That's not something you can rely on - see the answers to this question for more details: Order of process.stdout.on( 'data', ... ) and process.stderr.on( 'data', ... ). Data is delivered to your stdout handler in chunks, not line by line, and an arbitrary amount of data can arrive at once, depending on how much is currently sitting in the pipe.
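If you really do want one event per URL, a common workaround is to buffer the chunks yourself and split them on newlines, for example with Node's readline module. The snippet below is only a sketch built on the same csvData, keywords and outarr variables as your code (with the child process renamed to child so it doesn't shadow the global process), not something from your original script. You may also need to run Python with the -u flag so its output isn't held back in Python's stdout buffer:

const { spawn } = require('child_process');
const readline = require('readline');

// -u makes Python flush its output as it prints, so Node sees it promptly
const child = spawn('python', ['-u', './webextraction.py', csvData, keywords, req.body.full_search]);

// readline re-assembles the raw chunks from the pipe into complete lines
const rl = readline.createInterface({ input: child.stdout });

rl.on('line', (line) => {
    // One complete line printed by the Python script, no matter how the
    // chunks happened to arrive on the pipe
    console.log('\n ************* ');
    console.log(line.trim());
    outarr.push(line.trim());
});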
The part I'm most confused by is your wording "get the intermediate output of the python script in my nodejs script". There isn't necessarily any "intermediate" data - you can't rely on data arriving at your stdout handler at any particular time; it will hand you data at whatever rate the Python script itself, and the process it's running in, decide to produce it. That said, it sounds like your main problem is the timeout on the POST. You never end your request - that's why you're timing out. I assume you want to wait for the first chunk of data - however many lines it contains - before sending back a response. In that case, you need to add res.send, like this:
// form submit request
app.post('/formsubmit', function(req, res){
    csvData = req.files.csvfile.data.toString('utf8');
    filteredArray = cleanArray(csvData.split(/\r?\n/))
    csvData = get_array_string(filteredArray)
    csvData = csvData.trim()
    var keywords = req.body.keywords
    keywords = keywords.trim()

    // Send request to python script
    var spawn = require('child_process').spawn;
    var process = spawn('python', ["./webextraction.py", csvData, keywords, req.body.full_search])
    var outarr = []

    // process.stdout.on('data', (data) => {
    //     console.log(`stdout: ${data}`);
    // });

    // Keep track of whether we've already ended the request
    let responseSent = false;

    process.stdout.on('data', async function(data){
        console.log("\n ************* ")
        console.log(data.toString().trim())
        outarr.push(data.toString().trim())
        console.log("\n ************* ")

        // If the request hasn't already been ended, send back the current output from the script
        // and end the request
        if (!responseSent) {
            responseSent = true;
            res.send(outarr);
        }
    });
});
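One caveat, as a sketch on top of the code above rather than something you strictly need: if the script ever exits without printing anything (for example, if it crashes right away), the request would still hang. Reusing the same variable names, you could also end the response when the child process closes:

// 'process' here is the spawned child from the code above, not the global Node process.
// 'close' fires once the child has exited and its stdio streams have ended.
process.on('close', (code) => {
    if (!responseSent) {
        responseSent = true;
        // Send whatever was collected so far (possibly an empty array)
        res.send(outarr);
    }
});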