如何从网页(其中某个选项卡)的 HTML 页面源代码中提取数据?
How to extract data from HTML page source of (a tab within) a webpage?
我尝试了其他答案中给出的几种解决方案,例如使用不同的用户代理(Chrome、Safari 等),以及直接使用 HTTPClient 和 BufferedReader 获取 HTML,但都没有奏效。如何让 Android 上得到的输出与浏览器中的输出一致?
这是我期望得到的网页输出(查看 https://finance.yahoo.com/quote/AAPL/financials?p=AAPL 的页面源代码可获得完整输出——其中包含名为 "Quarterly" 的 AJAX 选项卡,该选项卡里有一个表格。我需要获取这些数据,但 Android 上获取到的 HTML 源代码里没有,而浏览器中的页面源代码里有):
root.App.main = {"context":{"dispatcher":{"stores":{"PageStore":{"currentPageName":"quote","currentRenderTargetId":"default","pagesConfigRaw":{"base":{"quote":{"layout":{"bundleName":"yahoodotcom-layout.TwoColumnLayout","name":"TwoColumnLayout","config":{"enableHeaderCollapse":true,"Header":{"isFixed":true,"uhContainerClasses":"Bgi($uhGrayGradient)","navContainerClasses":"Bgi($navrailGrayGradient) Bxsh($navrailShadow) Pos(r) hasScrolled_Bxsh(headerShadow) Panel-open_Bxsh(headerShadow)","navTransitionClasses":"HideNavrail_Translate3d(0,-46px,0) Panel-open_Translate3d(0,-46px,0)","secondaryNavContainerClasses":"hasScrolled_Bdbw(0px) Bxsh($navrailShadow)","height":135},"fetchNewAttribution":true},"meta":{"property":{"twitter:site":"@YahooFinance"}}},"meta":{"property":{"twitter:site":"@YahooFinance","fb:pages":"90376669494"}},"regions":{"SecondaryNav":[{"bundleName":"react-finance","name":"SecondaryNav","config":{"ui":{"enableRelativeUrl":true}},"props":{"key":"SecondaryNav-0-SecondaryNav","id":"SecondaryNav-0-SecondaryNav"},"isPageComposite":true}],"Overlay":[{"bundleName":"react-lightbox","name":"Lightbox","props":{"key":"Overlay-0-Lightbox","id":"Overlay-0-Lightbox"},"isPageComposite":true},{"bundleName":"td-app-finance","name":"Null","props":{"key":"Overlay-1-Null","id":"Overlay-1-Null"},"isPageComposite":true},{"bundleName":"td-app-finance","name":"Null","props":{"key":"Overlay-2-Null","id":"Overlay-2-Null"},"isPageComposite":true}],"Lead":[{"bundleName":"react-finance","name":"FinanceHeader","props":{"className":"Bxz(bb) H(100%) Pos(r) Maw($newGridWidth) Miw($minGridWidth) Miw(a)!--tab768 Miw(a)!--tab1024 Mstart(a) Mend(a) Px(20px) My(10px)","showAds":true,"adsConfig":{"positions":["FB2A","FB2B","FB2C","FB2D"]},"key":"Lead-0-FinanceHeader","id":"Lead-0-FinanceHeader"},"isPageComposite":true},{"bundleName":"tdv2-applet-featurebar","name":"FeatureBar","config":{"ui":{"container_classnames":"W(100%) Bxz(bb) Bdrs(2px) Mb(10px) Maw($maxModuleWidth) Miw($minGridWidth) 
Miw(a)!--tab768 Miw(a)!--tab1024 Mx(a)","prerender":{"enabled":true,"renderTargetId":"modal"}},"site":"finance"},"props":{"key":"Lead-1-FeatureBar","id":"Lead-1-FeatureBar"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteHeader","props":{"key":"Lead-2-QuoteHeader","id":"Lead-2-QuoteHeader"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteNav","props":{"key":"Lead-3-QuoteNav","id":"Lead-3-QuoteNav"},"isPageComposite":true}],"Col1":[{"bundleName":"td-ads","name":"Ad","props":{"pos":"LDRB","style":{"marginBottom":"8px","paddingTop":"0px","marginLeft":"auto","marginRight":"auto","textAlign":"center","lineHeight":"0px","position":"relative","zIndex":"5"},"key":"Col1-0-Ad","id":"Col1-0-Ad"},"isPageComposite":true},{"bundleName":"Quote.financials","name":"Financials","props":{"key":"Col1-1-Financials","id":"Col1-1-Financials"},"isPageComposite":true},{"bundleName":"react-finance","name":"AdUnitWithTdAds","props":{"className":"ad-foot","positions":["FOOT"],"key":"Col1-2-AdUnitWithTdAds","id":"Col1-2-AdUnitWithTdAds"},"isPageComposite":true},{"bundleName":"react-finance","name":"AdUnitWithTdAds","props":{"className":"ad-fsrvy","positions":["FSRVY"],"key":"Col1-3-AdUnitWithTdAds","id":"Col1-3-AdUnitWithTdAds"},"isPageComposite":true}],"Col2":[{"bundleName":"td-app-finance","name":"ExtPromoButton","props":{"className":"btn Bds(s) Bdc($c-fuji-grey-c) Bdrs(4px) Bgc($white) Bdw(1px) Bgc($ExtButtonHov):h C($white):h C($ExtButtonHov) Cur(p) Fz(s) Fw(b) H(44px) Lh(40px) Mb(20px) Ta(c) Td(n) 
W(100%)","sec":"ext-promo-all-mkt-submit","titleId":"EXTENSION_PROMO_TITLE","url":"https:\u002F\u002Fchrome.google.com\u002Fwebstore\u002Fdetail\u002Fdoojmkhhplhicnghmafjbhncmgjiohma","enabled":true,"key":"Col2-0-ExtPromoButton","id":"Col2-0-ExtPromoButton"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteModule","props":{"type":"eventPromo","key":"Col2-1-QuoteModule","id":"Col2-1-QuoteModule"},"isPageComposite":true},{"bundleName":"td-ads","name":"ComboAd","props":{"adparseStyle":{"marginBottom":"20px"},"finishedStyle":{"marginBottom":"20px"},"children":[{"bundleName":"td-ads","name":"Ad","props":{"pos":"LREC"}},{"bundleName":"td-ads","name":"Ad","props":{"pos":"MON"}}],"serverHeight":true,"key":"Col2-2-ComboAd","id":"Col2-2-ComboAd"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteModule","props":{"type":"similarCompanies","key":"Col2-3-QuoteModule","id":"Col2-3-QuoteModule"},"initMode":{"deferRender":true},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteModule","props":{"type":"earningsChart","key":"Col2-4-QuoteModule","id":"Col2-4-QuoteModule"},"initMode":{"deferRender":true},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteModule","props":{"type":"financialsChart","key":"Col2-5-QuoteModule","id":"Col2-5-QuoteModule"},"initMode":{"deferRender":true},"isPageComposite":true},{"bundleName":"react-finance",..."}}}};
这是我得到的 Android 输出;
(root.App.main = {"context":{"dispatcher":{"stores":{"PageStore":{"currentPageName":"quote","currentRenderTargetId":"default","pagesConfigRaw":{"base":{"quote":{"layout":{"bundleName":"yahoodotcom-layout.TwoColumnLayout","name":"TwoColumnLayout","config":{"enableHeaderCollapse":true,"Header":{"isFixed":true,"uhContainerClasses":"Bgi($uhGrayGradient)","navContainerClasses":"Bgi($navrailGrayGradient) Bxsh($navrailShadow) Pos(r) hasScrolled_Bxsh(headerShadow) Panel-open_Bxsh(headerShadow)","navTransitionClasses":"HideNavrail_Translate3d(0,-46px,0) Panel-open_Translate3d(0,-46px,0)","secondaryNavContainerClasses":"hasScrolled_Bdbw(0px) Bxsh($navrailShadow)","height":135},"fetchNewAttribution":true},"meta":{"property":{"twitter:site":"@YahooFinance"}}},"meta":{"property":{"twitter:site":"@YahooFinance","fb:pages":"90376669494"}},"regions":{"SecondaryNav":[{"bundleName":"react-finance","name":"SecondaryNav","config":{"ui":{"enableRelativeUrl":true}},"props":{"key":"SecondaryNav-0-SecondaryNav","id":"SecondaryNav-0-SecondaryNav"},"isPageComposite":true}],"Overlay":[{"bundleName":"react-lightbox","name":"Lightbox","props":{"key":"Overlay-0-Lightbox","id":"Overlay-0-Lightbox"},"isPageComposite":true},{"bundleName":"td-app-finance","name":"Null","props":{"key":"Overlay-1-Null","id":"Overlay-1-Null"},"isPageComposite":true},{"bundleName":"td-app-finance","name":"Null","props":{"key":"Overlay-2-Null","id":"Overlay-2-Null"},"isPageComposite":true}],"Lead":[{"bundleName":"react-finance","name":"FinanceHeader","props":{"className":"Bxz(bb) H(100%) Pos(r) Maw($newGridWidth) Miw($minGridWidth) Miw(a)!--tab768 Miw(a)!--tab1024 Mstart(a) Mend(a) Px(20px) My(10px)","showAds":true,"adsConfig":{"positions":["FB2A","FB2B","FB2C","FB2D"]},"key":"Lead-0-FinanceHeader","id":"Lead-0-FinanceHeader"},"isPageComposite":true},{"bundleName":"tdv2-applet-featurebar","name":"FeatureBar","config":{"ui":{"container_classnames":"W(100%) Bxz(bb) Bdrs(2px) Mb(10px) Maw($maxModuleWidth) 
Miw($minGridWidth) Miw(a)!--tab768 Miw(a)!--tab1024 Mx(a)","prerender":{"enabled":true,"renderTargetId":"modal"}},"site":"finance"},"props":{"key":"Lead-1-FeatureBar","id":"Lead-1-FeatureBar"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteHeader","props":{"key":"Lead-2-QuoteHeader","id":"Lead-2-QuoteHeader"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteNav","props":{"key":"Lead-3-QuoteNav","id":"Lead-3-QuoteNav"},"isPageComposite":true}],"Col1":[{"bundleName":"td-ads","name":"Ad","props":{"pos":"LDRB","style":{"marginBottom":"8px","paddingTop":"0px","marginLeft":"auto","marginRight":"auto","textAlign":"center","lineHeight":"0px","position":"relative","zIndex":"5"},"key":"Col1-0-Ad","id":"Col1-0-Ad"},"isPageComposite":true},{"bundleName":"Quote.financials","name":"Financials","props":{"key":"Col1-1-Financials","id":"Col1-1-Financials"},"isPageComposite":true},{"bundleName":"react-finance","name":"AdUnitWithTdAds","props":{"className":"ad-foot","positions":["FOOT"],"key":"Col1-2-AdUnitWithTdAds","id":"Col1-2-AdUnitWithTdAds"},"isPageComposite":true},{"bundleName":"react-finance","name":"AdUnitWithTdAds","props":{"className":"ad-fsrvy","positions":["FSRVY"],"key":"Col1-3-AdUnitWithTdAds","id":"Col1-3-AdUnitWithTdAds"},"isPageComposite":true}],"Col2":[{"bundleName":"td-app-finance","name":"ExtPromoButton","props":{"className":"btn Bds(s) Bdc($c-fuji-grey-c) Bdrs(4px) Bgc($white) Bdw(1px) Bgc($ExtButtonHov):h C($white):h C($ExtButtonHov) Cur(p) Fz(s) Fw(b) H(44px) Lh(40px) Mb(20px) Ta(c) Td(n) W(100%)","sec":"ext-promo-all-mkt-submit","titleId":"EXTENSION_PROMO_TITLE","url":"https:\u002F\u002Fchrome.google.com\u002Fwebstore\u002Fdetail\u002Fdoojmkhhplhicnghmafjbhncmgjiohma","enabled":true,"key":"Col2-0-ExtPromoButton","id":"Col2-0-ExtPromoButton"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteModule","props":{"type":"eventPromo","key":"Col2-1-QuoteModule","id":"Col2-1-QuoteModule"},"isPageComposite":true}
你有什么建议吗?谢谢。
我的代码;
// Fetch the page while presenting a desktop browser user agent.
String desktopUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.43";
Document doc = Jsoup.connect(requestURL)
        .userAgent(desktopUserAgent)
        .timeout(600000)
        .get();
// Elements whose class attribute equals myClassName.
Elements tableDivs = doc.getElementsByAttributeValue("class", myClassName);
// Dump the data of every <script> element to the log under the tag "ONE".
Elements scriptTags = doc.getElementsByTag("script");
for (Element script : scriptTags) {
    String scriptData = script.data();
    Log.e("ONE", scriptData);
}
Yahoo Finance 会重定向到 guce.oath.com,告知我们有关 cookie 和其他数据的使用情况,并要求在提供内容之前点击 'accept'。如果我们清除 cookie 并刷新页面,在浏览器中也可以观察到这一点。
我们可以从 guce.oath.com 页面中抓取链接,但我注意到最终的 URL 带有一个 guccounter=2
参数;如果直接使用这个 URL,就可以得到所需的响应。
// Request the URL that already carries the guccounter=2 query parameter.
String userAgent = "My UAString";
String requestURL = "https://finance.yahoo.com/quote/AAPL/financials?p=AAPL&guccounter=2";
Document doc = Jsoup
        .connect(requestURL)
        .userAgent(userAgent)
        .get();
由于数据不是HTML而是JavaScript代码,我们不能用jsoup
解析,但是可以用正则表达式。
// Find the <script> that assigns root.App.main and capture the assigned object literal.
Elements scriptTags = doc.getElementsByTag("script");
// Backslashes are doubled because "\." and "\s" are not legal escapes in a Java string
// literal; the regex engine still receives \. \s \( and so on.
String re = "root\\.App\\.main\\s*\\=\\s*(.*?);\\s*\\}\\(this\\)\\)\\s*;";
// Compile once instead of once per script. DOTALL lets ".*?" span line breaks.
Pattern pattern = Pattern.compile(re, Pattern.DOTALL);
String data = null;
for (Element script : scriptTags) {
    Matcher matcher = pattern.matcher(script.html());
    if (matcher.find()) {
        // Group 1 is the text between "root.App.main =" and the closing ";}(this));".
        data = matcher.group(1);
        break;
    }
}
data
字符串应包含来自 JavaScript 代码的字典,这是一个有效的 json 字符串,可以用 JSONObject
解析。
然而,在 Android Studio 上,据我所知没有重定向。我试过几个用户代理字符串,但页面似乎是直接加载的。尽管如此,包含数据的 JavaScript 字典仍然存在,我们可以提取它,并用 JSONObject
解析它。
Android Studio 代码:
String requestURL = "https://finance.yahoo.com/quote/AAPL/financials?p=AAPL";
String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.43";
// JSON key of the row to print.
String row = "totalRevenue";
try {
    Document doc = Jsoup.connect(requestURL).userAgent(userAgent).get();
    Elements scriptTags = doc.getElementsByTag("script");
    // Backslashes are doubled because "\." and "\s" are not legal escapes in a Java
    // string literal; the regex engine still receives \. \s \( and so on.
    String re = "root\\.App\\.main\\s*\\=\\s*(.*?);\\s*\\}\\(this\\)\\)\\s*;";
    // Compile once instead of once per script. DOTALL lets ".*?" span line breaks.
    Pattern pattern = Pattern.compile(re, Pattern.DOTALL);
    for (Element script : scriptTags) {
        Matcher matcher = pattern.matcher(script.html());
        if (matcher.find()) {
            // Group 1 is the object literal assigned to root.App.main.
            String data = matcher.group(1);
            JSONObject jo = new JSONObject(data);
            JSONArray table = getTable(jo);
            String[] tableRow = getRow(table, row);
            String values = TextUtils.join(", ", tableRow);
            Log.d("values", values);
            // Stop after the first matching script instead of scanning the rest.
            break;
        }
    }
} catch (Exception e) {
    // Network, regex and JSON failures all end up here and are logged with the stack trace.
    Log.e("err", "err", e);
}
这段代码应该能解析数据并选取 'Total Revenue' 的值。我使用的 getTable
和 getRow
方法:
/**
 * Walks the parsed object down the path
 * context / dispatcher / stores / QuoteSummaryStore / incomeStatementHistoryQuarterly
 * and returns the array stored under "incomeStatementHistory".
 *
 * @param json the parsed root object
 * @return the "incomeStatementHistory" array
 * @throws JSONException if a step of the path cannot be resolved
 */
private JSONArray getTable(JSONObject json) throws JSONException {
    JSONObject stores = json.getJSONObject("context")
            .getJSONObject("dispatcher")
            .getJSONObject("stores");
    JSONObject quarterly = stores.getJSONObject("QuoteSummaryStore")
            .getJSONObject("incomeStatementHistoryQuarterly");
    return quarterly.getJSONArray("incomeStatementHistory");
}
/**
 * Collects one named row across all entries of the table.
 *
 * @param table array of objects, one per column
 * @param name  JSON key of the row to read from each entry
 * @return one string per entry: the entry's {@code name.longFmt} value, or "-" when
 *         the entry has no such key or the nested object has no "longFmt"
 * @throws JSONException if an entry or the named value is not a JSON object
 */
private String[] getRow(JSONArray table, String name) throws JSONException {
    int columnCount = table.length();
    String[] values = new String[columnCount];
    for (int col = 0; col < columnCount; col++) {
        JSONObject entry = table.getJSONObject(col);
        // Placeholder used whenever the value cannot be found.
        String cell = "-";
        if (entry.has(name)) {
            JSONObject field = entry.getJSONObject(name);
            if (field.has("longFmt")) {
                cell = field.get("longFmt").toString();
            }
        }
        values[col] = cell;
    }
    return values;
}
/**
 * Reads the {@code endDate.fmt} value of every entry in the table.
 *
 * @param table array of objects, each expected to contain an "endDate" object
 * @return one string per entry, in table order
 * @throws JSONException if an entry lacks "endDate" or its "fmt" value
 */
private String[] getDates(JSONArray table) throws JSONException {
    String[] dates = new String[table.length()];
    for (int col = 0; col < dates.length; col++) {
        JSONObject endDate = table.getJSONObject(col).getJSONObject("endDate");
        dates[col] = endDate.get("fmt").toString();
    }
    return dates;
}
我认为获取 table 数据的最佳方法是将每个 html 行名称映射到一个 json 键。此外,主 table 有五个子 table,因此我们可以将每个嵌套的 table 映射到它包含的行。
/**
 * Builds the lookup from displayed labels to JSON keys.
 *
 * <p>The outer map is keyed by sub-table label ("Revenue", "Operating Expenses"); each
 * inner map goes from a row label to the JSON key of that row. Both levels keep
 * insertion order.
 *
 * @return a new, mutable map on every call
 */
Map<String, Map<String, String>> getTableNames() {
    // Plain LinkedHashMap instances replace double-brace initialization, which creates an
    // anonymous subclass per map and keeps a hidden reference to the enclosing instance.
    Map<String, String> revenue = new LinkedHashMap<>();
    revenue.put("Total Revenue", "totalRevenue");
    revenue.put("Cost of Revenue", "costOfRevenue");
    revenue.put("Gross Profit", "grossProfit");

    Map<String, String> operatingExpenses = new LinkedHashMap<>();
    operatingExpenses.put("Research Development", "researchDevelopment");
    operatingExpenses.put("Selling General and Administrative", "sellingGeneralAdministrative");
    operatingExpenses.put("Non Recurring", "nonRecurring");
    operatingExpenses.put("Others", "otherOperatingExpenses");
    operatingExpenses.put("Total Operating Expenses", "totalOperatingExpenses");
    operatingExpenses.put("Operating Income or Loss", "operatingIncome");

    Map<String, Map<String, String>> allTableNames = new LinkedHashMap<>();
    allTableNames.put("Revenue", revenue);
    allTableNames.put("Operating Expenses", operatingExpenses);
    return allTableNames;
}
我们可以用这个映射来选取单个单元格,例如 6/30/2018 的 'Total Revenue'(位于第一行第一列),
// Parse the extracted text and locate the table array.
JSONObject jo = new JSONObject(jsData);
JSONArray table = getTable(jo);
// Resolve the JSON key behind the "Total Revenue" label, then read that row.
Map<String, Map<String, String>> tableNames = getTableNames();
Map<String, String> revenueRows = tableNames.get("Revenue");
String totalRevenueKey = revenueRows.get("Total Revenue");
String[] totalRevenueValues = getRow(table, totalRevenueKey);
// Index 0 is the first column of the row.
String value = totalRevenueValues[0];
或者我们可以遍历 table 名称并构建一个列表或字符串,其中包含所有 table 数据。
// Flatten every mapped sub-table into one list: sub-table label, the dates, then for
// each row its label followed by its values.
List<String> tableData = new ArrayList<>();
Map<String, Map<String, String>> tableNames = getTableNames();
String[] dates = getDates(table);
for (Map.Entry<String, Map<String, String>> tableEntry : tableNames.entrySet()) {
    tableData.add(tableEntry.getKey());
    tableData.addAll(Arrays.asList(dates));
    Map<String, String> rows = tableEntry.getValue();
    for (Map.Entry<String, String> row : rows.entrySet()) {
        // Read the row before appending its label, so a failure adds nothing for it.
        String[] tableRow = getRow(table, row.getValue());
        tableData.add(row.getKey());
        tableData.addAll(Arrays.asList(tableRow));
    }
}
String tableDataString = TextUtils.join(", ", tableData);
我已尝试尽可能匹配 html table,因此 tableData
列表和结果字符串的格式为 "table name, date,date,date,date" 和 "row name, price,price,price,price",但最好只包含数字。 (在这种情况下,我们应该只添加 tableRow
项到 tableData
)
我尝试了其他答案中给出的几种解决方案,例如使用不同的用户代理(Chrome、Safari 等),以及直接使用 HTTPClient 和 BufferedReader 获取 HTML,但都没有奏效。如何让 Android 上得到的输出与浏览器中的输出一致? 这是我期望得到的网页输出(查看 https://finance.yahoo.com/quote/AAPL/financials?p=AAPL 的页面源代码可获得完整输出——其中包含名为 "Quarterly" 的 AJAX 选项卡,该选项卡里有一个表格。我需要获取这些数据,但 Android 上获取到的 HTML 源代码里没有,而浏览器中的页面源代码里有):
root.App.main = {"context":{"dispatcher":{"stores":{"PageStore":{"currentPageName":"quote","currentRenderTargetId":"default","pagesConfigRaw":{"base":{"quote":{"layout":{"bundleName":"yahoodotcom-layout.TwoColumnLayout","name":"TwoColumnLayout","config":{"enableHeaderCollapse":true,"Header":{"isFixed":true,"uhContainerClasses":"Bgi($uhGrayGradient)","navContainerClasses":"Bgi($navrailGrayGradient) Bxsh($navrailShadow) Pos(r) hasScrolled_Bxsh(headerShadow) Panel-open_Bxsh(headerShadow)","navTransitionClasses":"HideNavrail_Translate3d(0,-46px,0) Panel-open_Translate3d(0,-46px,0)","secondaryNavContainerClasses":"hasScrolled_Bdbw(0px) Bxsh($navrailShadow)","height":135},"fetchNewAttribution":true},"meta":{"property":{"twitter:site":"@YahooFinance"}}},"meta":{"property":{"twitter:site":"@YahooFinance","fb:pages":"90376669494"}},"regions":{"SecondaryNav":[{"bundleName":"react-finance","name":"SecondaryNav","config":{"ui":{"enableRelativeUrl":true}},"props":{"key":"SecondaryNav-0-SecondaryNav","id":"SecondaryNav-0-SecondaryNav"},"isPageComposite":true}],"Overlay":[{"bundleName":"react-lightbox","name":"Lightbox","props":{"key":"Overlay-0-Lightbox","id":"Overlay-0-Lightbox"},"isPageComposite":true},{"bundleName":"td-app-finance","name":"Null","props":{"key":"Overlay-1-Null","id":"Overlay-1-Null"},"isPageComposite":true},{"bundleName":"td-app-finance","name":"Null","props":{"key":"Overlay-2-Null","id":"Overlay-2-Null"},"isPageComposite":true}],"Lead":[{"bundleName":"react-finance","name":"FinanceHeader","props":{"className":"Bxz(bb) H(100%) Pos(r) Maw($newGridWidth) Miw($minGridWidth) Miw(a)!--tab768 Miw(a)!--tab1024 Mstart(a) Mend(a) Px(20px) My(10px)","showAds":true,"adsConfig":{"positions":["FB2A","FB2B","FB2C","FB2D"]},"key":"Lead-0-FinanceHeader","id":"Lead-0-FinanceHeader"},"isPageComposite":true},{"bundleName":"tdv2-applet-featurebar","name":"FeatureBar","config":{"ui":{"container_classnames":"W(100%) Bxz(bb) Bdrs(2px) Mb(10px) Maw($maxModuleWidth) Miw($minGridWidth) 
Miw(a)!--tab768 Miw(a)!--tab1024 Mx(a)","prerender":{"enabled":true,"renderTargetId":"modal"}},"site":"finance"},"props":{"key":"Lead-1-FeatureBar","id":"Lead-1-FeatureBar"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteHeader","props":{"key":"Lead-2-QuoteHeader","id":"Lead-2-QuoteHeader"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteNav","props":{"key":"Lead-3-QuoteNav","id":"Lead-3-QuoteNav"},"isPageComposite":true}],"Col1":[{"bundleName":"td-ads","name":"Ad","props":{"pos":"LDRB","style":{"marginBottom":"8px","paddingTop":"0px","marginLeft":"auto","marginRight":"auto","textAlign":"center","lineHeight":"0px","position":"relative","zIndex":"5"},"key":"Col1-0-Ad","id":"Col1-0-Ad"},"isPageComposite":true},{"bundleName":"Quote.financials","name":"Financials","props":{"key":"Col1-1-Financials","id":"Col1-1-Financials"},"isPageComposite":true},{"bundleName":"react-finance","name":"AdUnitWithTdAds","props":{"className":"ad-foot","positions":["FOOT"],"key":"Col1-2-AdUnitWithTdAds","id":"Col1-2-AdUnitWithTdAds"},"isPageComposite":true},{"bundleName":"react-finance","name":"AdUnitWithTdAds","props":{"className":"ad-fsrvy","positions":["FSRVY"],"key":"Col1-3-AdUnitWithTdAds","id":"Col1-3-AdUnitWithTdAds"},"isPageComposite":true}],"Col2":[{"bundleName":"td-app-finance","name":"ExtPromoButton","props":{"className":"btn Bds(s) Bdc($c-fuji-grey-c) Bdrs(4px) Bgc($white) Bdw(1px) Bgc($ExtButtonHov):h C($white):h C($ExtButtonHov) Cur(p) Fz(s) Fw(b) H(44px) Lh(40px) Mb(20px) Ta(c) Td(n) 
W(100%)","sec":"ext-promo-all-mkt-submit","titleId":"EXTENSION_PROMO_TITLE","url":"https:\u002F\u002Fchrome.google.com\u002Fwebstore\u002Fdetail\u002Fdoojmkhhplhicnghmafjbhncmgjiohma","enabled":true,"key":"Col2-0-ExtPromoButton","id":"Col2-0-ExtPromoButton"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteModule","props":{"type":"eventPromo","key":"Col2-1-QuoteModule","id":"Col2-1-QuoteModule"},"isPageComposite":true},{"bundleName":"td-ads","name":"ComboAd","props":{"adparseStyle":{"marginBottom":"20px"},"finishedStyle":{"marginBottom":"20px"},"children":[{"bundleName":"td-ads","name":"Ad","props":{"pos":"LREC"}},{"bundleName":"td-ads","name":"Ad","props":{"pos":"MON"}}],"serverHeight":true,"key":"Col2-2-ComboAd","id":"Col2-2-ComboAd"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteModule","props":{"type":"similarCompanies","key":"Col2-3-QuoteModule","id":"Col2-3-QuoteModule"},"initMode":{"deferRender":true},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteModule","props":{"type":"earningsChart","key":"Col2-4-QuoteModule","id":"Col2-4-QuoteModule"},"initMode":{"deferRender":true},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteModule","props":{"type":"financialsChart","key":"Col2-5-QuoteModule","id":"Col2-5-QuoteModule"},"initMode":{"deferRender":true},"isPageComposite":true},{"bundleName":"react-finance",..."}}}};
这是我得到的 Android 输出;
(root.App.main = {"context":{"dispatcher":{"stores":{"PageStore":{"currentPageName":"quote","currentRenderTargetId":"default","pagesConfigRaw":{"base":{"quote":{"layout":{"bundleName":"yahoodotcom-layout.TwoColumnLayout","name":"TwoColumnLayout","config":{"enableHeaderCollapse":true,"Header":{"isFixed":true,"uhContainerClasses":"Bgi($uhGrayGradient)","navContainerClasses":"Bgi($navrailGrayGradient) Bxsh($navrailShadow) Pos(r) hasScrolled_Bxsh(headerShadow) Panel-open_Bxsh(headerShadow)","navTransitionClasses":"HideNavrail_Translate3d(0,-46px,0) Panel-open_Translate3d(0,-46px,0)","secondaryNavContainerClasses":"hasScrolled_Bdbw(0px) Bxsh($navrailShadow)","height":135},"fetchNewAttribution":true},"meta":{"property":{"twitter:site":"@YahooFinance"}}},"meta":{"property":{"twitter:site":"@YahooFinance","fb:pages":"90376669494"}},"regions":{"SecondaryNav":[{"bundleName":"react-finance","name":"SecondaryNav","config":{"ui":{"enableRelativeUrl":true}},"props":{"key":"SecondaryNav-0-SecondaryNav","id":"SecondaryNav-0-SecondaryNav"},"isPageComposite":true}],"Overlay":[{"bundleName":"react-lightbox","name":"Lightbox","props":{"key":"Overlay-0-Lightbox","id":"Overlay-0-Lightbox"},"isPageComposite":true},{"bundleName":"td-app-finance","name":"Null","props":{"key":"Overlay-1-Null","id":"Overlay-1-Null"},"isPageComposite":true},{"bundleName":"td-app-finance","name":"Null","props":{"key":"Overlay-2-Null","id":"Overlay-2-Null"},"isPageComposite":true}],"Lead":[{"bundleName":"react-finance","name":"FinanceHeader","props":{"className":"Bxz(bb) H(100%) Pos(r) Maw($newGridWidth) Miw($minGridWidth) Miw(a)!--tab768 Miw(a)!--tab1024 Mstart(a) Mend(a) Px(20px) My(10px)","showAds":true,"adsConfig":{"positions":["FB2A","FB2B","FB2C","FB2D"]},"key":"Lead-0-FinanceHeader","id":"Lead-0-FinanceHeader"},"isPageComposite":true},{"bundleName":"tdv2-applet-featurebar","name":"FeatureBar","config":{"ui":{"container_classnames":"W(100%) Bxz(bb) Bdrs(2px) Mb(10px) Maw($maxModuleWidth) 
Miw($minGridWidth) Miw(a)!--tab768 Miw(a)!--tab1024 Mx(a)","prerender":{"enabled":true,"renderTargetId":"modal"}},"site":"finance"},"props":{"key":"Lead-1-FeatureBar","id":"Lead-1-FeatureBar"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteHeader","props":{"key":"Lead-2-QuoteHeader","id":"Lead-2-QuoteHeader"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteNav","props":{"key":"Lead-3-QuoteNav","id":"Lead-3-QuoteNav"},"isPageComposite":true}],"Col1":[{"bundleName":"td-ads","name":"Ad","props":{"pos":"LDRB","style":{"marginBottom":"8px","paddingTop":"0px","marginLeft":"auto","marginRight":"auto","textAlign":"center","lineHeight":"0px","position":"relative","zIndex":"5"},"key":"Col1-0-Ad","id":"Col1-0-Ad"},"isPageComposite":true},{"bundleName":"Quote.financials","name":"Financials","props":{"key":"Col1-1-Financials","id":"Col1-1-Financials"},"isPageComposite":true},{"bundleName":"react-finance","name":"AdUnitWithTdAds","props":{"className":"ad-foot","positions":["FOOT"],"key":"Col1-2-AdUnitWithTdAds","id":"Col1-2-AdUnitWithTdAds"},"isPageComposite":true},{"bundleName":"react-finance","name":"AdUnitWithTdAds","props":{"className":"ad-fsrvy","positions":["FSRVY"],"key":"Col1-3-AdUnitWithTdAds","id":"Col1-3-AdUnitWithTdAds"},"isPageComposite":true}],"Col2":[{"bundleName":"td-app-finance","name":"ExtPromoButton","props":{"className":"btn Bds(s) Bdc($c-fuji-grey-c) Bdrs(4px) Bgc($white) Bdw(1px) Bgc($ExtButtonHov):h C($white):h C($ExtButtonHov) Cur(p) Fz(s) Fw(b) H(44px) Lh(40px) Mb(20px) Ta(c) Td(n) W(100%)","sec":"ext-promo-all-mkt-submit","titleId":"EXTENSION_PROMO_TITLE","url":"https:\u002F\u002Fchrome.google.com\u002Fwebstore\u002Fdetail\u002Fdoojmkhhplhicnghmafjbhncmgjiohma","enabled":true,"key":"Col2-0-ExtPromoButton","id":"Col2-0-ExtPromoButton"},"isPageComposite":true},{"bundleName":"QuotePage","name":"QuoteModule","props":{"type":"eventPromo","key":"Col2-1-QuoteModule","id":"Col2-1-QuoteModule"},"isPageComposite":true}
你有什么建议吗?谢谢。 我的代码;
// Fetch the page while presenting a desktop browser user agent.
String desktopUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.43";
Document doc = Jsoup.connect(requestURL)
        .userAgent(desktopUserAgent)
        .timeout(600000)
        .get();
// Elements whose class attribute equals myClassName.
Elements tableDivs = doc.getElementsByAttributeValue("class", myClassName);
// Dump the data of every <script> element to the log under the tag "ONE".
Elements scriptTags = doc.getElementsByTag("script");
for (Element script : scriptTags) {
    String scriptData = script.data();
    Log.e("ONE", scriptData);
}
Yahoo Finance 会重定向到 guce.oath.com,告知我们有关 cookie 和其他数据的使用情况,并要求在提供内容之前点击 'accept'。如果我们清除 cookie 并刷新页面,在浏览器中也可以观察到这一点。
我们可以从 guce.oath.com 页面中抓取链接,但我注意到最终的 URL 带有一个 guccounter=2
参数;如果直接使用这个 URL,就可以得到所需的响应。
// Request the URL that already carries the guccounter=2 query parameter.
String userAgent = "My UAString";
String requestURL = "https://finance.yahoo.com/quote/AAPL/financials?p=AAPL&guccounter=2";
Document doc = Jsoup
        .connect(requestURL)
        .userAgent(userAgent)
        .get();
由于数据不是HTML而是JavaScript代码,我们不能用jsoup
解析,但是可以用正则表达式。
// Find the <script> that assigns root.App.main and capture the assigned object literal.
Elements scriptTags = doc.getElementsByTag("script");
// Backslashes are doubled because "\." and "\s" are not legal escapes in a Java string
// literal; the regex engine still receives \. \s \( and so on.
String re = "root\\.App\\.main\\s*\\=\\s*(.*?);\\s*\\}\\(this\\)\\)\\s*;";
// Compile once instead of once per script. DOTALL lets ".*?" span line breaks.
Pattern pattern = Pattern.compile(re, Pattern.DOTALL);
String data = null;
for (Element script : scriptTags) {
    Matcher matcher = pattern.matcher(script.html());
    if (matcher.find()) {
        // Group 1 is the text between "root.App.main =" and the closing ";}(this));".
        data = matcher.group(1);
        break;
    }
}
data
字符串应包含来自 JavaScript 代码的字典,这是一个有效的 json 字符串,可以用 JSONObject
解析。
然而,在 Android Studio 上,据我所知没有重定向。我试过几个用户代理字符串,但页面似乎是直接加载的。尽管如此,包含数据的 JavaScript 字典仍然存在,我们可以提取它,并用 JSONObject
解析它。
Android Studio 代码:
String requestURL = "https://finance.yahoo.com/quote/AAPL/financials?p=AAPL";
String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.43";
// JSON key of the row to print.
String row = "totalRevenue";
try {
    Document doc = Jsoup.connect(requestURL).userAgent(userAgent).get();
    Elements scriptTags = doc.getElementsByTag("script");
    // Backslashes are doubled because "\." and "\s" are not legal escapes in a Java
    // string literal; the regex engine still receives \. \s \( and so on.
    String re = "root\\.App\\.main\\s*\\=\\s*(.*?);\\s*\\}\\(this\\)\\)\\s*;";
    // Compile once instead of once per script. DOTALL lets ".*?" span line breaks.
    Pattern pattern = Pattern.compile(re, Pattern.DOTALL);
    for (Element script : scriptTags) {
        Matcher matcher = pattern.matcher(script.html());
        if (matcher.find()) {
            // Group 1 is the object literal assigned to root.App.main.
            String data = matcher.group(1);
            JSONObject jo = new JSONObject(data);
            JSONArray table = getTable(jo);
            String[] tableRow = getRow(table, row);
            String values = TextUtils.join(", ", tableRow);
            Log.d("values", values);
            // Stop after the first matching script instead of scanning the rest.
            break;
        }
    }
} catch (Exception e) {
    // Network, regex and JSON failures all end up here and are logged with the stack trace.
    Log.e("err", "err", e);
}
这段代码应该能解析数据并选取 'Total Revenue' 的值。我使用的 getTable
和 getRow
方法:
/**
 * Walks the parsed object down the path
 * context / dispatcher / stores / QuoteSummaryStore / incomeStatementHistoryQuarterly
 * and returns the array stored under "incomeStatementHistory".
 *
 * @param json the parsed root object
 * @return the "incomeStatementHistory" array
 * @throws JSONException if a step of the path cannot be resolved
 */
private JSONArray getTable(JSONObject json) throws JSONException {
    JSONObject stores = json.getJSONObject("context")
            .getJSONObject("dispatcher")
            .getJSONObject("stores");
    JSONObject quarterly = stores.getJSONObject("QuoteSummaryStore")
            .getJSONObject("incomeStatementHistoryQuarterly");
    return quarterly.getJSONArray("incomeStatementHistory");
}
/**
 * Collects one named row across all entries of the table.
 *
 * @param table array of objects, one per column
 * @param name  JSON key of the row to read from each entry
 * @return one string per entry: the entry's {@code name.longFmt} value, or "-" when
 *         the entry has no such key or the nested object has no "longFmt"
 * @throws JSONException if an entry or the named value is not a JSON object
 */
private String[] getRow(JSONArray table, String name) throws JSONException {
    int columnCount = table.length();
    String[] values = new String[columnCount];
    for (int col = 0; col < columnCount; col++) {
        JSONObject entry = table.getJSONObject(col);
        // Placeholder used whenever the value cannot be found.
        String cell = "-";
        if (entry.has(name)) {
            JSONObject field = entry.getJSONObject(name);
            if (field.has("longFmt")) {
                cell = field.get("longFmt").toString();
            }
        }
        values[col] = cell;
    }
    return values;
}
/**
 * Reads the {@code endDate.fmt} value of every entry in the table.
 *
 * @param table array of objects, each expected to contain an "endDate" object
 * @return one string per entry, in table order
 * @throws JSONException if an entry lacks "endDate" or its "fmt" value
 */
private String[] getDates(JSONArray table) throws JSONException {
    String[] dates = new String[table.length()];
    for (int col = 0; col < dates.length; col++) {
        JSONObject endDate = table.getJSONObject(col).getJSONObject("endDate");
        dates[col] = endDate.get("fmt").toString();
    }
    return dates;
}
我认为获取 table 数据的最佳方法是将每个 html 行名称映射到一个 json 键。此外,主 table 有五个子 table,因此我们可以将每个嵌套的 table 映射到它包含的行。
/**
 * Builds the lookup from displayed labels to JSON keys.
 *
 * <p>The outer map is keyed by sub-table label ("Revenue", "Operating Expenses"); each
 * inner map goes from a row label to the JSON key of that row. Both levels keep
 * insertion order.
 *
 * @return a new, mutable map on every call
 */
Map<String, Map<String, String>> getTableNames() {
    // Plain LinkedHashMap instances replace double-brace initialization, which creates an
    // anonymous subclass per map and keeps a hidden reference to the enclosing instance.
    Map<String, String> revenue = new LinkedHashMap<>();
    revenue.put("Total Revenue", "totalRevenue");
    revenue.put("Cost of Revenue", "costOfRevenue");
    revenue.put("Gross Profit", "grossProfit");

    Map<String, String> operatingExpenses = new LinkedHashMap<>();
    operatingExpenses.put("Research Development", "researchDevelopment");
    operatingExpenses.put("Selling General and Administrative", "sellingGeneralAdministrative");
    operatingExpenses.put("Non Recurring", "nonRecurring");
    operatingExpenses.put("Others", "otherOperatingExpenses");
    operatingExpenses.put("Total Operating Expenses", "totalOperatingExpenses");
    operatingExpenses.put("Operating Income or Loss", "operatingIncome");

    Map<String, Map<String, String>> allTableNames = new LinkedHashMap<>();
    allTableNames.put("Revenue", revenue);
    allTableNames.put("Operating Expenses", operatingExpenses);
    return allTableNames;
}
我们可以用这个映射来选取单个单元格,例如 6/30/2018 的 'Total Revenue'(位于第一行第一列),
// Parse the extracted text and locate the table array.
JSONObject jo = new JSONObject(jsData);
JSONArray table = getTable(jo);
// Resolve the JSON key behind the "Total Revenue" label, then read that row.
Map<String, Map<String, String>> tableNames = getTableNames();
Map<String, String> revenueRows = tableNames.get("Revenue");
String totalRevenueKey = revenueRows.get("Total Revenue");
String[] totalRevenueValues = getRow(table, totalRevenueKey);
// Index 0 is the first column of the row.
String value = totalRevenueValues[0];
或者我们可以遍历 table 名称并构建一个列表或字符串,其中包含所有 table 数据。
// Flatten every mapped sub-table into one list: sub-table label, the dates, then for
// each row its label followed by its values.
List<String> tableData = new ArrayList<>();
Map<String, Map<String, String>> tableNames = getTableNames();
String[] dates = getDates(table);
for (Map.Entry<String, Map<String, String>> tableEntry : tableNames.entrySet()) {
    tableData.add(tableEntry.getKey());
    tableData.addAll(Arrays.asList(dates));
    Map<String, String> rows = tableEntry.getValue();
    for (Map.Entry<String, String> row : rows.entrySet()) {
        // Read the row before appending its label, so a failure adds nothing for it.
        String[] tableRow = getRow(table, row.getValue());
        tableData.add(row.getKey());
        tableData.addAll(Arrays.asList(tableRow));
    }
}
String tableDataString = TextUtils.join(", ", tableData);
我已尝试尽可能匹配 html table,因此 tableData
列表和结果字符串的格式为 "table name, date,date,date,date" 和 "row name, price,price,price,price",但最好只包含数字。 (在这种情况下,我们应该只添加 tableRow
项到 tableData
)