如何通过 java 使用 JSOUP 抓取和下载 HTML 页面中的 table
How to scrape and download the table in HTML page using JSOUP through java
我试过了...但是行和列是分开打印的...我的要求是从 HTML 页面下载 table。
/**
 * Question code: parses a piece of HTML with jsoup and prints the contents of
 * every table found in it, one cell per output line.
 */
public class Main {

    /**
     * Parses the text held in {@code html}, dumps the resulting document, then
     * prints each header cell followed by each data cell on its own line.
     *
     * @param args ignored
     * @throws IOException declared by the original signature
     */
    public static void main(String[] args) throws IOException {
        String html = "URL";
        // The original also carried a disabled alternative that fetches the
        // document over the network instead: Jsoup.connect(html).get()
        Document doc = Jsoup.parse(html);
        System.out.println(doc);

        Elements tables = doc.select("table");

        // Header cells: every <th> that sits in a <thead> row.
        System.out.println("headers");
        for (Element headerCell : tables.select("thead tr th")) {
            System.out.println(headerCell.text());
        }
        System.out.println();

        // Data cells: a "row" marker per selected <tr>, then each <td> on its own line.
        for (Element row : tables.select(":not(thead) tr")) {
            System.out.println("row");
            for (Element cell : row.select("td")) {
                System.out.println(cell.text());
            }
        }
    }
}
提前致谢...
按照 luksch 在评论中提到的教程,解决方案可能是:
package com.github.davidepastore.Whosebug34331254;
import java.io.FileWriter;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Answer code for Stack Overflow question 34331254: downloads an HTML page with
 * jsoup and writes the first {@code <table>} found on it to a CSV file.
 */
public class App {

    /**
     * Fetches the page, echoes the document and every table cell to standard
     * output, and writes the table's header row and data rows to {@code table.csv}.
     *
     * @param args ignored
     * @throws IOException if the page cannot be fetched or the CSV file cannot be written
     * @throws IllegalStateException if the fetched page contains no {@code <table>}
     */
    public static void main(String[] args) throws IOException {
        String url = "http://www.htmlcodetutorial.com/tables/_THEAD.html";
        String fileName = "table.csv";

        // Fetch first, so a failed download does not leave an empty CSV file behind.
        Document doc = Jsoup.connect(url).get();
        System.out.println(doc);

        Element tableElement = doc.select("table").first();
        if (tableElement == null) {
            throw new IllegalStateException("No <table> element found at " + url);
        }

        // try-with-resources closes (and flushes) the writer even when a write fails.
        try (FileWriter writer = new FileWriter(fileName)) {
            Elements tableHeaderEles = tableElement.select("thead tr th");
            System.out.println("headers");
            // Without header cells there is nothing to write; avoid a blank first line.
            if (!tableHeaderEles.isEmpty()) {
                writeCsvRow(writer, tableHeaderEles);
            }
            System.out.println();

            Elements tableRowElements = tableElement.select(":not(thead) tr");
            for (Element row : tableRowElements) {
                System.out.println("row");
                Elements rowItems = row.select("td");
                // A row without any <td> (e.g. a footer row made only of <th> cells)
                // would otherwise produce an empty CSV line.
                if (rowItems.isEmpty()) {
                    continue;
                }
                writeCsvRow(writer, rowItems);
            }
        }
    }

    /** Prints each cell's text to standard output and writes the cells as one CSV line. */
    private static void writeCsvRow(FileWriter writer, Elements cells) throws IOException {
        for (int i = 0; i < cells.size(); i++) {
            String text = cells.get(i).text();
            System.out.println(text);
            if (i > 0) {
                writer.append(',');
            }
            writer.append(csvField(text));
        }
        writer.append('\n');
    }

    /** Quotes a value when it contains a comma, double quote or line break, doubling embedded quotes. */
    private static String csvField(String text) {
        boolean needsQuoting = text.indexOf(',') >= 0
                || text.indexOf('"') >= 0
                || text.indexOf('\n') >= 0
                || text.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return text;
        }
        return "\"" + text.replace("\"", "\"\"") + "\"";
    }
}
这将创建一个 csv 文件,其中将包含第一个 table(包括 headers)。
table内容为:
<table cellpadding="6" rules="GROUPS" frame="BOX">
<thead>
<tr>
<th>Weekday</th>
<th>Date</th>
<th>Manager</th>
<th>Qty</th>
</tr>
</thead>
<tbody>
<tr>
<td>Mon</td>
<td>09/11</td>
<td>Kelsey</td>
<td>639</td>
</tr>
<tr>
<td>Tue</td>
<td>09/12</td>
<td>Lindsey</td>
<td>596</td>
</tr>
<tr>
<td>Wed</td>
<td>09/13</td>
<td>Randy</td>
<td>1135</td>
</tr>
<tr>
<td>Thu</td>
<td>09/14</td>
<td>Susan</td>
<td>1002</td>
</tr>
<tr>
<td>Fri</td>
<td>09/15</td>
<td>Randy</td>
<td>908</td>
</tr>
<tr>
<td>Sat</td>
<td>09/16</td>
<td>Lindsey</td>
<td>371</td>
</tr>
<tr>
<td>Sun</td>
<td>09/17</td>
<td>Susan</td>
<td>272</td>
</tr>
</tbody>
<tfoot>
<tr>
<th align="LEFT" colspan="3">Total</th>
<th>4923</th>
</tr>
</tfoot>
</table>
csv 输出将是:
Weekday,Date,Manager,Qty
Mon,09/11,Kelsey,639
Tue,09/12,Lindsey,596
Wed,09/13,Randy,1135
Thu,09/14,Susan,1002
Fri,09/15,Randy,908
Sat,09/16,Lindsey,371
Sun,09/17,Susan,272
更新
The table that you're talking about is loaded from another url: http://factfinder.census.gov/tablerestful/tableServices/renderProductData?renderForMap=f&renderForChart=f&pid=PEP_2014_PEPANNRES&src=pt&log=t&_ts=468903667318
它包含一个 productDataTable 属性和 table 内容。
我试过了...但是行和列是分开打印的...我的要求是从 HTML 页面下载 table。
public class Main {
/**
 * Parses the text held in {@code html} with jsoup, dumps the resulting document,
 * then prints each table header cell followed by each table data cell, one per line.
 *
 * @param args ignored
 * @throws IOException declared by the original signature
 */
public static void main(String[] args) throws IOException {
    String html = "URL";
    // The original also carried a disabled alternative that fetches the
    // document over the network instead: Jsoup.connect(html).get()
    Document doc = Jsoup.parse(html);
    System.out.println(doc);

    Elements tables = doc.select("table");

    // Header cells: every <th> that sits in a <thead> row.
    System.out.println("headers");
    for (Element headerCell : tables.select("thead tr th")) {
        System.out.println(headerCell.text());
    }
    System.out.println();

    // Data cells: a "row" marker per selected <tr>, then each <td> on its own line.
    for (Element tableRow : tables.select(":not(thead) tr")) {
        System.out.println("row");
        for (Element dataCell : tableRow.select("td")) {
            System.out.println(dataCell.text());
        }
    }
}
}
提前致谢...
按照 luksch 在评论中提到的教程,解决方案可能是:
package com.github.davidepastore.Whosebug34331254;
import java.io.FileWriter;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Answer code for Stack Overflow question 34331254: downloads an HTML page with
 * jsoup and writes the first {@code <table>} found on it to a CSV file.
 */
public class App {

    /**
     * Fetches the page, echoes the document and every table cell to standard
     * output, and writes the table's header row and data rows to {@code table.csv}.
     *
     * @param args ignored
     * @throws IOException if the page cannot be fetched or the CSV file cannot be written
     * @throws IllegalStateException if the fetched page contains no {@code <table>}
     */
    public static void main(String[] args) throws IOException {
        String url = "http://www.htmlcodetutorial.com/tables/_THEAD.html";
        String fileName = "table.csv";

        // Fetch first, so a failed download does not leave an empty CSV file behind.
        Document doc = Jsoup.connect(url).get();
        System.out.println(doc);

        Element tableElement = doc.select("table").first();
        if (tableElement == null) {
            throw new IllegalStateException("No <table> element found at " + url);
        }

        // try-with-resources closes (and flushes) the writer even when a write fails.
        try (FileWriter writer = new FileWriter(fileName)) {
            Elements tableHeaderEles = tableElement.select("thead tr th");
            System.out.println("headers");
            // Without header cells there is nothing to write; avoid a blank first line.
            if (!tableHeaderEles.isEmpty()) {
                writeCsvRow(writer, tableHeaderEles);
            }
            System.out.println();

            Elements tableRowElements = tableElement.select(":not(thead) tr");
            for (Element row : tableRowElements) {
                System.out.println("row");
                Elements rowItems = row.select("td");
                // A row without any <td> (e.g. a footer row made only of <th> cells)
                // would otherwise produce an empty CSV line.
                if (rowItems.isEmpty()) {
                    continue;
                }
                writeCsvRow(writer, rowItems);
            }
        }
    }

    /** Prints each cell's text to standard output and writes the cells as one CSV line. */
    private static void writeCsvRow(FileWriter writer, Elements cells) throws IOException {
        for (int i = 0; i < cells.size(); i++) {
            String text = cells.get(i).text();
            System.out.println(text);
            if (i > 0) {
                writer.append(',');
            }
            writer.append(csvField(text));
        }
        writer.append('\n');
    }

    /** Quotes a value when it contains a comma, double quote or line break, doubling embedded quotes. */
    private static String csvField(String text) {
        boolean needsQuoting = text.indexOf(',') >= 0
                || text.indexOf('"') >= 0
                || text.indexOf('\n') >= 0
                || text.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return text;
        }
        return "\"" + text.replace("\"", "\"\"") + "\"";
    }
}
这将创建一个 csv 文件,其中将包含第一个 table(包括 headers)。
table内容为:
<table cellpadding="6" rules="GROUPS" frame="BOX">
<thead>
<tr>
<th>Weekday</th>
<th>Date</th>
<th>Manager</th>
<th>Qty</th>
</tr>
</thead>
<tbody>
<tr>
<td>Mon</td>
<td>09/11</td>
<td>Kelsey</td>
<td>639</td>
</tr>
<tr>
<td>Tue</td>
<td>09/12</td>
<td>Lindsey</td>
<td>596</td>
</tr>
<tr>
<td>Wed</td>
<td>09/13</td>
<td>Randy</td>
<td>1135</td>
</tr>
<tr>
<td>Thu</td>
<td>09/14</td>
<td>Susan</td>
<td>1002</td>
</tr>
<tr>
<td>Fri</td>
<td>09/15</td>
<td>Randy</td>
<td>908</td>
</tr>
<tr>
<td>Sat</td>
<td>09/16</td>
<td>Lindsey</td>
<td>371</td>
</tr>
<tr>
<td>Sun</td>
<td>09/17</td>
<td>Susan</td>
<td>272</td>
</tr>
</tbody>
<tfoot>
<tr>
<th align="LEFT" colspan="3">Total</th>
<th>4923</th>
</tr>
</tfoot>
</table>
csv 输出将是:
Weekday,Date,Manager,Qty
Mon,09/11,Kelsey,639
Tue,09/12,Lindsey,596
Wed,09/13,Randy,1135
Thu,09/14,Susan,1002
Fri,09/15,Randy,908
Sat,09/16,Lindsey,371
Sun,09/17,Susan,272
更新
The table that you're talking about is loaded from another url: http://factfinder.census.gov/tablerestful/tableServices/renderProductData?renderForMap=f&renderForChart=f&pid=PEP_2014_PEPANNRES&src=pt&log=t&_ts=468903667318
它包含一个 productDataTable 属性和 table 内容。