Java 用于 ping 数千个 url 的程序
Java program to ping thousands of urls
我想编写一个程序来 ping 至少 10000 个 URL。我写了一个小程序,但发现它并没有我想象的那么快:
Ping 100 个网址需要 3-4 分钟。
有人有更好的建议吗?
/**
 * Sends an HTTP HEAD request to every URL, one after another, and collects the URLs
 * that do not answer with status 200 or 302.
 *
 * @param urls the URLs to probe
 * @return a map ordered by URL; each value is either the unexpected status code or the
 *         message of the {@link IOException} raised while contacting the URL
 */
private static Map<String, String> findUnreachableUrls(Set<String> urls) {
    // Without explicit timeouts the connection waits with no Java-level limit, so a single
    // unresponsive host can stall the whole sequential loop for a long time.
    final int timeoutMillis = 5000;
    Map<String, String> badUrls = new TreeMap<>();
    for (String url : urls) {
        try {
            HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
            connection.setConnectTimeout(timeoutMillis);
            connection.setReadTimeout(timeoutMillis);
            // HEAD asks for the status line and headers only, no response body.
            connection.setRequestMethod("HEAD");
            connection.connect();
            int responseCode = connection.getResponseCode();
            if (responseCode != 200 && responseCode != 302) {
                badUrls.put(url, Integer.toString(responseCode));
            }
        } catch (IOException e) {
            // Covers malformed URLs, unknown hosts, refused connections and timeouts.
            badUrls.put(url, e.getMessage());
        }
    }
    return badUrls;
}
您应该使用并行线程,例如用 5 个线程各处理 20 个 URL,最后再汇总结果,这样会快得多。最简单的解决方案是使用 Java 8 的并行流(parallel stream)来处理这些 URL。以下是相应的示例程序:
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
/**
 * Checks a set of URLs in parallel and reports the ones that cannot be reached.
 */
public class Main {

    /** Connect and read timeout, in milliseconds, applied to every request. */
    private static final int TIMEOUT_MILLIS = 5000;

    public static void main(String[] args) {
        Set<String> urlSet = new HashSet<>();
        //Populate your set with the Strings
        findUnreachableUrls(urlSet);
    }

    /**
     * Probes every URL on a parallel stream and collects the failures.
     *
     * @param urls the URLs to probe
     * @return a map ordered by URL that contains only the URLs that failed; each value is
     *         the unexpected status code or a description of the I/O failure
     */
    private static Map<String, String> findUnreachableUrls(Set<String> urls) {
        Map<String, String> badUrls = new TreeMap<>();
        urls.parallelStream().forEach(url -> {
            // The slow network call runs outside the lock so the probes stay parallel.
            String failure = checkUrl(url);
            if (!failure.isEmpty()) {
                // TreeMap is not thread-safe and this lambda runs on several worker threads,
                // so the insertion itself has to be serialized.
                synchronized (badUrls) {
                    badUrls.put(url, failure);
                }
            }
        });
        return badUrls;
    }

    /**
     * Sends an HTTP HEAD request to one URL.
     *
     * @param url the URL to probe
     * @return an empty string when the URL answered with 200 or 302; otherwise the status
     *         code or a non-empty description of the I/O failure
     */
    private static String checkUrl(String url) {
        try {
            HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
            // Bound the wait so one silent host cannot block a worker thread for long.
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            connection.setRequestMethod("HEAD");
            connection.connect();
            int responseCode = connection.getResponseCode();
            return isReachable(responseCode) ? "" : Integer.toString(responseCode);
        } catch (IOException e) {
            return describeFailure(e);
        }
    }

    /** Returns whether the status code counts as reachable (200 or 302). */
    private static boolean isReachable(int responseCode) {
        // The previous test "!= 200 || != 302" was true for every status code.
        return responseCode == 200 || responseCode == 302;
    }

    /** Returns a non-empty description of the failure, even when the exception has no message. */
    private static String describeFailure(IOException e) {
        String message = e.getMessage();
        return (message == null || message.isEmpty()) ? e.toString() : message;
    }
}
按照我前面所说的思路,我会这样实现(未经测试)。如果主机只有少数几个而 URL 很多,或许应该把按主机分组后的 URL 再进一步拆分。
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Probes URLs with HTTP HEAD requests and reports the ones that fail.
 *
 * <p>Used as a {@link Function}, it maps one URL to an optional failure description.
 * {@link #process(List)} applies that function to a whole list, grouping the URLs by
 * host and port before handing the groups to a parallel stream.
 */
public class TestURLs implements Function<String, Optional<TestURLs.Tuple>> {

    /** Connect and read timeout, in milliseconds. */
    public static final int TIMEOUT = 3000;

    /** Grouping key shared by all URLs that cannot be parsed. */
    private static final String MALFORMED_GROUP = "<malformed>";

    /** Pairs a URL with the reason it was considered unreachable. */
    public static class Tuple {
        final String url;
        final String error;

        public Tuple(String url, String error) {
            this.url = url;
            this.error = error;
        }
    }

    /** Maps a URL to a {@code host:port} key that identifies the server it points to. */
    public enum HostNamePortExtractor implements Function<String, String> {
        INSTANCE;

        /**
         * @param url the URL to parse
         * @return the host and the effective port joined by {@code ':'}; the protocol's
         *         default port is used when the URL does not name one
         * @throws IllegalArgumentException if the URL cannot be parsed
         */
        @Override
        public String apply(String url) {
            try {
                URL u = new URL(url);
                // getPort() is -1 when the URL carries no explicit port; fall back to the
                // protocol default so "http://h/" and "http://h:80/" share one key.
                int port = u.getPort() != -1 ? u.getPort() : u.getDefaultPort();
                // The separator keeps "10.0.0.1" + 180 apart from "10.0.0.11" + 80.
                return u.getHost() + ":" + port;
            } catch (IOException e) {
                throw new IllegalArgumentException("Malformed URL: " + url, e);
            }
        }
    }

    /**
     * Sends an HTTP HEAD request to the URL.
     *
     * @param url the URL to probe
     * @return an empty optional when the status is 200 or 302; otherwise a tuple holding
     *         the URL and either the status code or a description of the I/O failure
     */
    @Override
    public Optional<Tuple> apply(String url) {
        try {
            HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
            connection.setRequestMethod("HEAD");
            connection.setReadTimeout(TIMEOUT);
            connection.setConnectTimeout(TIMEOUT);
            connection.connect();
            int responseCode = connection.getResponseCode();
            // Both comparisons must hold; joined with "||" the test would always be true.
            if (responseCode != 200 && responseCode != 302) {
                return Optional.of(new Tuple(url, Integer.toString(responseCode)));
            }
            return Optional.empty();
        } catch (IOException e) {
            // getMessage() may be null, and a null value would make Collectors.toMap throw.
            String message = e.getMessage();
            return Optional.of(new Tuple(url, message != null ? message : e.toString()));
        }
    }

    /**
     * Probes the built-in URL list, which is empty until URLs are added to it.
     *
     * @return a map from failed URL to failure description
     */
    public Map<String, String> process() {
        List<String> urls = new ArrayList<>(); // add urls here
        return process(urls);
    }

    /**
     * Probes the given URLs and collects the failures.
     *
     * @param urls the URLs to probe; duplicates and unparseable entries are tolerated
     * @return a map from failed URL to failure description
     */
    public Map<String, String> process(List<String> urls) {
        // group by hostname+port
        Map<String, List<String>> groupedUrls =
                urls.stream().collect(Collectors.groupingBy(TestURLs::groupingKey));
        Stream<Tuple> errors = groupedUrls.values().parallelStream()
                // Only the groups are split across threads; each group's URLs are
                // expected to be handled one after another by the same task.
                .flatMap(List::stream)
                // go to the server
                .map(this::apply)
                // drop the URLs that answered normally
                .filter(Optional::isPresent)
                .map(Optional::get);
        // A URL listed twice yields two tuples; keep the first instead of failing.
        return errors.collect(Collectors.toMap(t -> t.url, t -> t.error, (first, second) -> first));
    }

    /** Returns the host:port key of the URL, or a shared key when it cannot be parsed. */
    private static String groupingKey(String url) {
        try {
            return HostNamePortExtractor.INSTANCE.apply(url);
        } catch (IllegalArgumentException e) {
            // Keep the URL in the pipeline so apply() reports the parse error for it.
            return MALFORMED_GROUP;
        }
    }

    public static void main(String[] args) {
        TestURLs testUrls = new TestURLs();
        testUrls.process().entrySet().forEach(e -> {
            System.out.println(e.getKey() + " error: " + e.getValue());
        });
    }
}
为了完整起见,我是这样实现的。
/**
 * Pings every URL on a parallel stream and keeps those whose recorded response is
 * neither "200" nor "302".
 *
 * <p>NOTE(review): {@code Pair} is declared elsewhere; its fields {@code s} and {@code t}
 * are presumably the key and value that {@code pingUrl} fills via setKey/setValue.
 * Confirm against the Pair class.
 *
 * @param urls the URLs to ping
 * @return a concurrent map from URL to the unexpected status code or failure text
 */
private static Map<String, String> findUnreachableUrls1(Set<String> urls) {
    return urls.parallelStream()
            .map(url -> pingUrl(url))
            // Constant-first equals keeps the filter safe when no response text was recorded.
            .filter(response -> !"200".equals(response.t) && !"302".equals(response.t))
            // toConcurrentMap rejects null values, so substitute a placeholder text.
            .collect(Collectors.toConcurrentMap(
                    response -> response.s,
                    response -> response.t != null ? response.t : "unknown I/O error"));
}
/**
 * Sends an HTTP HEAD request to the URL and records the outcome.
 *
 * @param url the URL to ping
 * @return a pair whose key is the URL and whose value is the response code as text, or a
 *         description of the {@link IOException} when the request failed
 */
static Pair<String, String> pingUrl(String url) {
    Pair<String, String> urlResponse = new Pair<>();
    urlResponse.setKey(url);
    try {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        // Limit both the connect and the read phase to five seconds.
        connection.setConnectTimeout(5000);
        connection.setReadTimeout(5000);
        connection.setRequestMethod("HEAD");
        connection.connect();
        urlResponse.setValue(Integer.toString(connection.getResponseCode()));
    } catch (IOException e) {
        // getMessage() may be null; always store a usable text so callers never see null.
        String message = e.getMessage();
        urlResponse.setValue(message != null ? message : e.toString());
    }
    return urlResponse;
}
我想编写一个程序来 ping 至少 10000 个 URL。我写了一个小程序,但发现它并没有我想象的那么快:
Ping 100 个网址需要 3-4 分钟。
有人有更好的建议吗?
/**
 * Sends an HTTP HEAD request to every URL, one after another, and collects the URLs
 * that do not answer with status 200 or 302.
 *
 * @param urls the URLs to probe
 * @return a map ordered by URL; each value is either the unexpected status code or the
 *         message of the {@link IOException} raised while contacting the URL
 */
private static Map<String, String> findUnreachableUrls(Set<String> urls) {
    // Without explicit timeouts the connection waits with no Java-level limit, so a single
    // unresponsive host can stall the whole sequential loop for a long time.
    final int timeoutMillis = 5000;
    Map<String, String> badUrls = new TreeMap<>();
    for (String url : urls) {
        try {
            HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
            connection.setConnectTimeout(timeoutMillis);
            connection.setReadTimeout(timeoutMillis);
            // HEAD asks for the status line and headers only, no response body.
            connection.setRequestMethod("HEAD");
            connection.connect();
            int responseCode = connection.getResponseCode();
            if (responseCode != 200 && responseCode != 302) {
                badUrls.put(url, Integer.toString(responseCode));
            }
        } catch (IOException e) {
            // Covers malformed URLs, unknown hosts, refused connections and timeouts.
            badUrls.put(url, e.getMessage());
        }
    }
    return badUrls;
}
您应该使用并行线程,例如用 5 个线程各处理 20 个 URL,最后再汇总结果,这样会快得多。最简单的解决方案是使用 Java 8 的并行流(parallel stream)来处理这些 URL。以下是相应的示例程序:
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
/**
 * Checks a set of URLs in parallel and reports the ones that cannot be reached.
 */
public class Main {

    /** Connect and read timeout, in milliseconds, applied to every request. */
    private static final int TIMEOUT_MILLIS = 5000;

    public static void main(String[] args) {
        Set<String> urlSet = new HashSet<>();
        //Populate your set with the Strings
        findUnreachableUrls(urlSet);
    }

    /**
     * Probes every URL on a parallel stream and collects the failures.
     *
     * @param urls the URLs to probe
     * @return a map ordered by URL that contains only the URLs that failed; each value is
     *         the unexpected status code or a description of the I/O failure
     */
    private static Map<String, String> findUnreachableUrls(Set<String> urls) {
        Map<String, String> badUrls = new TreeMap<>();
        urls.parallelStream().forEach(url -> {
            // The slow network call runs outside the lock so the probes stay parallel.
            String failure = checkUrl(url);
            if (!failure.isEmpty()) {
                // TreeMap is not thread-safe and this lambda runs on several worker threads,
                // so the insertion itself has to be serialized.
                synchronized (badUrls) {
                    badUrls.put(url, failure);
                }
            }
        });
        return badUrls;
    }

    /**
     * Sends an HTTP HEAD request to one URL.
     *
     * @param url the URL to probe
     * @return an empty string when the URL answered with 200 or 302; otherwise the status
     *         code or a non-empty description of the I/O failure
     */
    private static String checkUrl(String url) {
        try {
            HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
            // Bound the wait so one silent host cannot block a worker thread for long.
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            connection.setRequestMethod("HEAD");
            connection.connect();
            int responseCode = connection.getResponseCode();
            return isReachable(responseCode) ? "" : Integer.toString(responseCode);
        } catch (IOException e) {
            return describeFailure(e);
        }
    }

    /** Returns whether the status code counts as reachable (200 or 302). */
    private static boolean isReachable(int responseCode) {
        // The previous test "!= 200 || != 302" was true for every status code.
        return responseCode == 200 || responseCode == 302;
    }

    /** Returns a non-empty description of the failure, even when the exception has no message. */
    private static String describeFailure(IOException e) {
        String message = e.getMessage();
        return (message == null || message.isEmpty()) ? e.toString() : message;
    }
}
按照我前面所说的思路,我会这样实现(未经测试)。如果主机只有少数几个而 URL 很多,或许应该把按主机分组后的 URL 再进一步拆分。
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Probes URLs with HTTP HEAD requests and reports the ones that fail.
 *
 * <p>Used as a {@link Function}, it maps one URL to an optional failure description.
 * {@link #process(List)} applies that function to a whole list, grouping the URLs by
 * host and port before handing the groups to a parallel stream.
 */
public class TestURLs implements Function<String, Optional<TestURLs.Tuple>> {

    /** Connect and read timeout, in milliseconds. */
    public static final int TIMEOUT = 3000;

    /** Grouping key shared by all URLs that cannot be parsed. */
    private static final String MALFORMED_GROUP = "<malformed>";

    /** Pairs a URL with the reason it was considered unreachable. */
    public static class Tuple {
        final String url;
        final String error;

        public Tuple(String url, String error) {
            this.url = url;
            this.error = error;
        }
    }

    /** Maps a URL to a {@code host:port} key that identifies the server it points to. */
    public enum HostNamePortExtractor implements Function<String, String> {
        INSTANCE;

        /**
         * @param url the URL to parse
         * @return the host and the effective port joined by {@code ':'}; the protocol's
         *         default port is used when the URL does not name one
         * @throws IllegalArgumentException if the URL cannot be parsed
         */
        @Override
        public String apply(String url) {
            try {
                URL u = new URL(url);
                // getPort() is -1 when the URL carries no explicit port; fall back to the
                // protocol default so "http://h/" and "http://h:80/" share one key.
                int port = u.getPort() != -1 ? u.getPort() : u.getDefaultPort();
                // The separator keeps "10.0.0.1" + 180 apart from "10.0.0.11" + 80.
                return u.getHost() + ":" + port;
            } catch (IOException e) {
                throw new IllegalArgumentException("Malformed URL: " + url, e);
            }
        }
    }

    /**
     * Sends an HTTP HEAD request to the URL.
     *
     * @param url the URL to probe
     * @return an empty optional when the status is 200 or 302; otherwise a tuple holding
     *         the URL and either the status code or a description of the I/O failure
     */
    @Override
    public Optional<Tuple> apply(String url) {
        try {
            HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
            connection.setRequestMethod("HEAD");
            connection.setReadTimeout(TIMEOUT);
            connection.setConnectTimeout(TIMEOUT);
            connection.connect();
            int responseCode = connection.getResponseCode();
            // Both comparisons must hold; joined with "||" the test would always be true.
            if (responseCode != 200 && responseCode != 302) {
                return Optional.of(new Tuple(url, Integer.toString(responseCode)));
            }
            return Optional.empty();
        } catch (IOException e) {
            // getMessage() may be null, and a null value would make Collectors.toMap throw.
            String message = e.getMessage();
            return Optional.of(new Tuple(url, message != null ? message : e.toString()));
        }
    }

    /**
     * Probes the built-in URL list, which is empty until URLs are added to it.
     *
     * @return a map from failed URL to failure description
     */
    public Map<String, String> process() {
        List<String> urls = new ArrayList<>(); // add urls here
        return process(urls);
    }

    /**
     * Probes the given URLs and collects the failures.
     *
     * @param urls the URLs to probe; duplicates and unparseable entries are tolerated
     * @return a map from failed URL to failure description
     */
    public Map<String, String> process(List<String> urls) {
        // group by hostname+port
        Map<String, List<String>> groupedUrls =
                urls.stream().collect(Collectors.groupingBy(TestURLs::groupingKey));
        Stream<Tuple> errors = groupedUrls.values().parallelStream()
                // Only the groups are split across threads; each group's URLs are
                // expected to be handled one after another by the same task.
                .flatMap(List::stream)
                // go to the server
                .map(this::apply)
                // drop the URLs that answered normally
                .filter(Optional::isPresent)
                .map(Optional::get);
        // A URL listed twice yields two tuples; keep the first instead of failing.
        return errors.collect(Collectors.toMap(t -> t.url, t -> t.error, (first, second) -> first));
    }

    /** Returns the host:port key of the URL, or a shared key when it cannot be parsed. */
    private static String groupingKey(String url) {
        try {
            return HostNamePortExtractor.INSTANCE.apply(url);
        } catch (IllegalArgumentException e) {
            // Keep the URL in the pipeline so apply() reports the parse error for it.
            return MALFORMED_GROUP;
        }
    }

    public static void main(String[] args) {
        TestURLs testUrls = new TestURLs();
        testUrls.process().entrySet().forEach(e -> {
            System.out.println(e.getKey() + " error: " + e.getValue());
        });
    }
}
为了完整起见,我是这样实现的。
/**
 * Pings every URL on a parallel stream and keeps those whose recorded response is
 * neither "200" nor "302".
 *
 * <p>NOTE(review): {@code Pair} is declared elsewhere; its fields {@code s} and {@code t}
 * are presumably the key and value that {@code pingUrl} fills via setKey/setValue.
 * Confirm against the Pair class.
 *
 * @param urls the URLs to ping
 * @return a concurrent map from URL to the unexpected status code or failure text
 */
private static Map<String, String> findUnreachableUrls1(Set<String> urls) {
    return urls.parallelStream()
            .map(url -> pingUrl(url))
            // Constant-first equals keeps the filter safe when no response text was recorded.
            .filter(response -> !"200".equals(response.t) && !"302".equals(response.t))
            // toConcurrentMap rejects null values, so substitute a placeholder text.
            .collect(Collectors.toConcurrentMap(
                    response -> response.s,
                    response -> response.t != null ? response.t : "unknown I/O error"));
}
/**
 * Sends an HTTP HEAD request to the URL and records the outcome.
 *
 * @param url the URL to ping
 * @return a pair whose key is the URL and whose value is the response code as text, or a
 *         description of the {@link IOException} when the request failed
 */
static Pair<String, String> pingUrl(String url) {
    Pair<String, String> urlResponse = new Pair<>();
    urlResponse.setKey(url);
    try {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        // Limit both the connect and the read phase to five seconds.
        connection.setConnectTimeout(5000);
        connection.setReadTimeout(5000);
        connection.setRequestMethod("HEAD");
        connection.connect();
        urlResponse.setValue(Integer.toString(connection.getResponseCode()));
    } catch (IOException e) {
        // getMessage() may be null; always store a usable text so callers never see null.
        String message = e.getMessage();
        urlResponse.setValue(message != null ? message : e.toString());
    }
    return urlResponse;
}