如何通过浏览器扩展获取页面body标签的内容?
How to get the contents of the body tag of the page through the browser extension?
为什么变量 body 是空的?如何获取 body 标签的内容?
I am writing code for google chrome extension. I plan to use the
extension for personal use only. For web scraping.
To finally analyze the text. I want to play with text.
background.js
// Registers a listener on chrome.tabs.onUpdated.
// NOTE: the manifest loads this file as a background script, so the `document`
// used below belongs to the background context, not to the tab that was updated.
chrome.tabs.onUpdated.addListener(function(tabId, changeInfo, tab) {
if(document.readyState == "complete"){
var url = changeInfo.url;
// I want to save the url to a file(url.txt)
var body = document.body.innerText;
// why is body empty?????
// Matches an uppercase ASCII letter followed lazily by anything up to the next period.
var pattern = /[A-Z].*?\./g;
// NOTE(review): `text` is never declared in this snippet; `body` was presumably intended - confirm.
var result = text.match(pattern);
result.forEach(myFunction);
// Appends one matched fragment plus a newline to `text`.
function myFunction(item) {
text += item+"\n";
}
// I want to save the text to a file(collection.txt)
}
});
manifest.json
{
"name": "Parser",
"version": "1",
"manifest_version": 2,
"background": {
"scripts":["background.js"]
},
"permissions": [
"tabs",
"activeTab",
"storage",
"http://*/*",
"https://*/*"
]
}
根据您发布的 manifest.json,
看来您是将代码作为后台脚本运行的。
后台脚本无法直接访问已加载页面的内容 - 这就是代码中 body 为空的原因。
相反,您需要使用内容脚本来访问页面内容,然后将该数据发送到您的后台脚本进行处理。
这里是一个同时使用后台脚本和内容脚本的示例设置,它可以让您在选项卡加载完成时获取并处理页面内容(未经测试,但应该能为您指明正确的方向)。
感谢ResourceOverride extension,我在编写下面的示例时将其用作参考。
background.js
// background.js
// Handles messages arriving from the content script and routes them by `action`.
chrome.runtime.onMessage.addListener(function(message, sender){
  var isObjectMessage = Boolean(message) && typeof message === 'object';
  if (!isObjectMessage || !sender.tab){
    // Ignore messages that weren't sent by our content script.
    return;
  }
  // Only one action is understood; anything else is silently dropped.
  if (message.action === 'receiveBodyText'){
    processBodyText(sender.tab, message.bodyText);
  }
});
/**
 * Collects sentence-like fragments from the body text reported by a tab.
 *
 * A fragment starts at an uppercase ASCII letter and extends lazily to the
 * next period on the same line.
 *
 * @param {{url: string}} tab - Tab the message came from; only `url` is read.
 * @param {string} bodyText - Body text sent by the content script.
 * @returns {string} Every fragment followed by a newline, or '' when nothing matches.
 */
function processBodyText(tab, bodyText){
  const url = tab.url;
  // I want to save the url to a file(url.txt)

  const pattern = /[A-Z].*?\./g;
  // match() yields null when there is no match, so fall back to an empty list.
  const fragments = String(bodyText ?? '').match(pattern) || [];
  const text = fragments.map((item) => item + "\n").join('');
  // I want to save the text to a file(collection.txt)
  return text;
}
content.js
// content.js
// Sends the page's body text to the background script once the page has loaded.
(function(){
  // Reads the body text at call time and posts it as a 'receiveBodyText' message.
  function sendBodyText(){
    chrome.runtime.sendMessage({
      action: 'receiveBodyText',
      bodyText: document.body.innerText
    });
  }

  // The manifest sets no run_at, so the default ("document_idle") applies and this
  // script may be injected after window's load event has already fired; a load
  // listener registered that late would never run.
  if (document.readyState === 'complete'){
    sendBodyText();
  } else {
    window.addEventListener('load', sendBodyText);
  }
})();
manifest.json
// manifest.json
{
"name": "Parser",
"version": "1",
"manifest_version": 2,
"background": {
"scripts":["background.js"]
},
"content_scripts": [{
"matches" : [
"http://*/*",
"https://*/*"
],
"js": ["content.js"]
}],
"permissions": [
"tabs",
"activeTab",
"storage",
"http://*/*",
"https://*/*"
]
}
信息和文档
关于 Firefox 和 Chrome 之间 WebExtension API 差异的说明:
Chrome uses the chrome
namespace, Firefox uses the future-standard browser
namespace.
So code written in Chrome would use: chrome.tabs.onUpdated(...)
and the equivalent in Firefox would be: browser.tabs.onUpdated(...)
Be aware of that when reading the docs and reading example extensions.
后台脚本
- 无法访问加载的页面
- 可以完全访问 WebExtensions API
内容脚本
- 对加载的页面具有完全访问权限
- 对 WebExtensions API 的访问权限有限
- Chrome content scripts docs
- MDN content scripts docs
- 关于 communication between content scripts and background scripts 的文档
WebExtensions API
其他有用的链接
- MDN WebExtensions example Github repository
- MDN "Anatomy of a WebExtension"
- MDN detailed browser WebExtensions support tables
- ResourceOverride extension - 这是一个相当复杂的扩展,同时使用后台脚本和内容脚本。我将其用作参考/示例,以更好地理解扩展的编写方式。
为什么变量 body 是空的?如何获取 body 标签的内容?
I am writing code for google chrome extension. I plan to use the extension for personal use only. For web scraping. To finally analyze the text. I want to play with text.
background.js
// Registers a listener on chrome.tabs.onUpdated.
// NOTE: the manifest loads this file as a background script, so the `document`
// used below belongs to the background context, not to the tab that was updated.
chrome.tabs.onUpdated.addListener(function(tabId, changeInfo, tab) {
if(document.readyState == "complete"){
var url = changeInfo.url;
// I want to save the url to a file(url.txt)
var body = document.body.innerText;
// why is body empty?????
// Matches an uppercase ASCII letter followed lazily by anything up to the next period.
var pattern = /[A-Z].*?\./g;
// NOTE(review): `text` is never declared in this snippet; `body` was presumably intended - confirm.
var result = text.match(pattern);
result.forEach(myFunction);
// Appends one matched fragment plus a newline to `text`.
function myFunction(item) {
text += item+"\n";
}
// I want to save the text to a file(collection.txt)
}
});
manifest.json
{
"name": "Parser",
"version": "1",
"manifest_version": 2,
"background": {
"scripts":["background.js"]
},
"permissions": [
"tabs",
"activeTab",
"storage",
"http://*/*",
"https://*/*"
]
}
根据您发布的 manifest.json,
看来您是将代码作为后台脚本运行的。
后台脚本无法直接访问已加载页面的内容 - 这就是代码中 body 为空的原因。
相反,您需要使用内容脚本来访问页面内容,然后将该数据发送到您的后台脚本进行处理。
这里是一个同时使用后台脚本和内容脚本的示例设置,它可以让您在选项卡加载完成时获取并处理页面内容(未经测试,但应该能为您指明正确的方向)。
感谢ResourceOverride extension,我在编写下面的示例时将其用作参考。
background.js
// background.js
// Handles messages arriving from the content script and routes them by `action`.
chrome.runtime.onMessage.addListener(function(message, sender){
  var isObjectMessage = Boolean(message) && typeof message === 'object';
  if (!isObjectMessage || !sender.tab){
    // Ignore messages that weren't sent by our content script.
    return;
  }
  // Only one action is understood; anything else is silently dropped.
  if (message.action === 'receiveBodyText'){
    processBodyText(sender.tab, message.bodyText);
  }
});
/**
 * Collects sentence-like fragments from the body text reported by a tab.
 *
 * A fragment starts at an uppercase ASCII letter and extends lazily to the
 * next period on the same line.
 *
 * @param {{url: string}} tab - Tab the message came from; only `url` is read.
 * @param {string} bodyText - Body text sent by the content script.
 * @returns {string} Every fragment followed by a newline, or '' when nothing matches.
 */
function processBodyText(tab, bodyText){
  const url = tab.url;
  // I want to save the url to a file(url.txt)

  const pattern = /[A-Z].*?\./g;
  // match() yields null when there is no match, so fall back to an empty list.
  const fragments = String(bodyText ?? '').match(pattern) || [];
  const text = fragments.map((item) => item + "\n").join('');
  // I want to save the text to a file(collection.txt)
  return text;
}
content.js
// content.js
// Sends the page's body text to the background script once the page has loaded.
(function(){
  // Reads the body text at call time and posts it as a 'receiveBodyText' message.
  function sendBodyText(){
    chrome.runtime.sendMessage({
      action: 'receiveBodyText',
      bodyText: document.body.innerText
    });
  }

  // The manifest sets no run_at, so the default ("document_idle") applies and this
  // script may be injected after window's load event has already fired; a load
  // listener registered that late would never run.
  if (document.readyState === 'complete'){
    sendBodyText();
  } else {
    window.addEventListener('load', sendBodyText);
  }
})();
manifest.json
// manifest.json
{
"name": "Parser",
"version": "1",
"manifest_version": 2,
"background": {
"scripts":["background.js"]
},
"content_scripts": [{
"matches" : [
"http://*/*",
"https://*/*"
],
"js": ["content.js"]
}],
"permissions": [
"tabs",
"activeTab",
"storage",
"http://*/*",
"https://*/*"
]
}
信息和文档
关于 Firefox 和 Chrome 之间 WebExtension API 差异的说明:
Chrome uses the chrome
namespace, Firefox uses the future-standard browser
namespace.
So code written in Chrome would use: chrome.tabs.onUpdated(...)
and the equivalent in Firefox would be: browser.tabs.onUpdated(...)
Be aware of that when reading the docs and reading example extensions.
后台脚本
- 无法访问加载的页面
- 可以完全访问 WebExtensions API
内容脚本
- 对加载的页面具有完全访问权限
- 对 WebExtensions API 的访问权限有限
- Chrome content scripts docs
- MDN content scripts docs
- 关于 communication between content scripts and background scripts 的文档
WebExtensions API
其他有用的链接
- MDN WebExtensions example Github repository
- MDN "Anatomy of a WebExtension"
- MDN detailed browser WebExtensions support tables
- ResourceOverride extension - 这是一个相当复杂的扩展,同时使用后台脚本和内容脚本。我将其用作参考/示例,以更好地理解扩展的编写方式。