核心类(非多线程):
package com.magus.datahub.master.jsoup;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
/**
 * Single-threaded demo that parses an HTML table from a local file with jsoup
 * and maps each data row to a {@link JsoupUser}.
 *
 * <p>Every parsed user is printed to standard output, followed by the elapsed
 * wall-clock time in milliseconds.
 *
 * @Author lxk
 * @Date 2019/12/18 0018 15:17
 * @Version V1.0
 */
public class JsoupController {

    /** Number of leading rows skipped before data rows start (presumably title and column header). */
    private static final int HEADER_ROWS = 2;

    /**
     * Parses the hard-coded HTML file, prints every row as a user record and
     * reports how long the run took.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            long start = System.currentTimeMillis();
            File input = new File("C:/Users/Administrator/Desktop/AA.HTML");
            Document document = Jsoup.parse(input, "UTF-8", "http://example.com/");

            // first() yields null when the document has no <tbody>; fail with a
            // clear message instead of an anonymous NullPointerException.
            Element tbody = document.getElementsByTag("tbody").first();
            if (tbody == null) {
                System.err.println("No <tbody> element found in " + input.getPath());
                return;
            }

            Elements trs = tbody.select("tr");
            List<JsoupUser> users = new ArrayList<>();
            for (int i = HEADER_ROWS; i < trs.size(); i++) {
                JsoupUser user = toUser(trs.get(i));
                System.out.println(user.getId() + " " + user.getCode() + " " + user.getName()
                        + " " + user.getOrg() + " " + user.getMobile() + " " + user.getPhone()
                        + " " + user.getOrder() + " \n");
                users.add(user);
            }

            long end = System.currentTimeMillis();
            System.out.println("执行任务消耗了:" + (end - start));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds a user from one table row; the text of cells 0..6 is assigned to
     * id, code, name, org, mobile, phone and order respectively.
     *
     * @param tr the {@code <tr>} element to read
     * @return a user whose fields are set for every cell present in the row;
     *         fields for missing cells stay {@code null}
     */
    private static JsoupUser toUser(Element tr) {
        JsoupUser user = new JsoupUser();
        Elements tds = tr.select("td");
        for (int j = 0; j < tds.size(); j++) {
            String text = tds.get(j).text();
            switch (j) {
                case 0:
                    user.setId(text);
                    break;
                case 1:
                    user.setCode(text);
                    break;
                case 2:
                    user.setName(text);
                    break;
                case 3:
                    user.setOrg(text);
                    break;
                case 4:
                    user.setMobile(text);
                    break;
                case 5:
                    user.setPhone(text);
                    break;
                case 6:
                    user.setOrder(text);
                    break;
                default:
                    // Cells beyond the seventh are ignored.
                    break;
            }
        }
        return user;
    }
}
核心类(多线程):
package com.magus.datahub.master.jsoup;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
/**
 * Multi-threaded demo that parses an HTML table from a local file with jsoup.
 *
 * <p>The data rows are split into batches of {@link #BATCH_SIZE}; each batch is
 * converted to {@link JsoupUser} objects by a task running on a fixed thread
 * pool. Results are printed in batch order, followed by the elapsed
 * wall-clock time in milliseconds.
 *
 * @Author lxk
 * @Date 2019/12/18 0018 15:17
 * @Version V1.0
 */
public class JsoupThreadController {

    /** Number of leading rows (title and column header) that carry no user data. */
    private static final int HEADER_ROWS = 2;

    /** Maximum number of rows handled by a single task. */
    private static final int BATCH_SIZE = 500;

    /**
     * Parses the hard-coded HTML file in parallel batches, prints every row as
     * a user record and reports how long the run took.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        ExecutorService exec = null;
        try {
            long start = System.currentTimeMillis();
            File input = new File("C:/Users/Administrator/Desktop/AA.HTML");
            Document document = Jsoup.parse(input, "UTF-8", "http://example.com/");

            // first() yields null when the document has no <tbody>; fail with a
            // clear message instead of an anonymous NullPointerException.
            Element tbody = document.getElementsByTag("tbody").first();
            if (tbody == null) {
                System.err.println("No <tbody> element found in " + input.getPath());
                return;
            }

            // Copy the data rows into a plain list instead of calling remove(0)
            // on the selection: this tolerates tables with fewer than
            // HEADER_ROWS rows and never mutates the jsoup selection itself.
            Elements trs = tbody.select("tr");
            List<Element> rows = new ArrayList<>();
            for (int i = HEADER_ROWS; i < trs.size(); i++) {
                rows.add(trs.get(i));
            }

            // One task per batch; the last batch may be shorter.
            int dataSize = rows.size();
            List<Callable<List<JsoupUser>>> tasks = new ArrayList<>();
            for (int from = 0; from < dataSize; from += BATCH_SIZE) {
                final List<Element> batch = rows.subList(from, Math.min(from + BATCH_SIZE, dataSize));
                tasks.add(() -> parseRows(batch));
            }

            if (!tasks.isEmpty()) {
                // Parsing is CPU-bound, so more threads than cores gains nothing.
                int poolSize = Math.min(tasks.size(), Runtime.getRuntime().availableProcessors());
                exec = Executors.newFixedThreadPool(Math.max(1, poolSize));

                // invokeAll alone runs every task exactly once and returns the
                // futures in task order; no separate submit() is needed.
                List<Future<List<JsoupUser>>> results = exec.invokeAll(tasks);
                for (Future<List<JsoupUser>> future : results) {
                    for (JsoupUser u : future.get()) {
                        System.out.println(u.getId() + " " + u.getCode() + " " + u.getName() + " " + u.getOrg() + " " + u.getMobile() + " " + u.getPhone() + " " + u.getOrder() + " \n");
                    }
                }
            }

            long end = System.currentTimeMillis();
            System.out.println("执行任务消耗了:" + (end - start));
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers further up can observe it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Always release the worker threads; otherwise a failure above
            // would leave non-daemon threads keeping the JVM alive.
            if (exec != null) {
                exec.shutdown();
            }
        }
    }

    /**
     * Converts a batch of table rows to users, preserving row order.
     *
     * @param rows the {@code <tr>} elements of one batch
     * @return one user per row
     */
    private static List<JsoupUser> parseRows(List<Element> rows) {
        List<JsoupUser> users = new ArrayList<>(rows.size());
        for (Element tr : rows) {
            users.add(toUser(tr));
        }
        return users;
    }

    /**
     * Builds a user from one table row; the text of cells 0..6 is assigned to
     * id, code, name, org, mobile, phone and order respectively.
     *
     * @param tr the {@code <tr>} element to read
     * @return a user whose fields are set for every cell present in the row;
     *         fields for missing cells stay {@code null}
     */
    private static JsoupUser toUser(Element tr) {
        JsoupUser user = new JsoupUser();
        Elements tds = tr.select("td");
        for (int k = 0; k < tds.size(); k++) {
            String text = tds.get(k).text();
            switch (k) {
                case 0:
                    user.setId(text);
                    break;
                case 1:
                    user.setCode(text);
                    break;
                case 2:
                    user.setName(text);
                    break;
                case 3:
                    user.setOrg(text);
                    break;
                case 4:
                    user.setMobile(text);
                    break;
                case 5:
                    user.setPhone(text);
                    break;
                case 6:
                    user.setOrder(text);
                    break;
                default:
                    // Cells beyond the seventh are ignored.
                    break;
            }
        }
        return user;
    }
}
解析为目标对象:
package com.magus.datahub.master.jsoup;
/**
 * Mutable value holder for one parsed table row.
 *
 * <p>All properties are plain strings and default to {@code null} until the
 * matching setter is called. No validation or conversion is performed.
 *
 * @Author lxk
 * @Date 2019/12/18 0018 15:54
 * @Version V1.0
 */
public class JsoupUser {

    // ---- state ----------------------------------------------------------

    private String id;
    private String code;
    private String name;
    private String org;
    private String mobile;
    private String phone;
    private String order;

    // ---- readers --------------------------------------------------------

    /** @return the stored id, or {@code null} if never set */
    public String getId() {
        return this.id;
    }

    /** @return the stored code, or {@code null} if never set */
    public String getCode() {
        return this.code;
    }

    /** @return the stored name, or {@code null} if never set */
    public String getName() {
        return this.name;
    }

    /** @return the stored org, or {@code null} if never set */
    public String getOrg() {
        return this.org;
    }

    /** @return the stored mobile value, or {@code null} if never set */
    public String getMobile() {
        return this.mobile;
    }

    /** @return the stored phone value, or {@code null} if never set */
    public String getPhone() {
        return this.phone;
    }

    /** @return the stored order value, or {@code null} if never set */
    public String getOrder() {
        return this.order;
    }

    // ---- writers --------------------------------------------------------

    /** @param id new id value; stored as given */
    public void setId(String id) {
        this.id = id;
    }

    /** @param code new code value; stored as given */
    public void setCode(String code) {
        this.code = code;
    }

    /** @param name new name value; stored as given */
    public void setName(String name) {
        this.name = name;
    }

    /** @param org new org value; stored as given */
    public void setOrg(String org) {
        this.org = org;
    }

    /** @param mobile new mobile value; stored as given */
    public void setMobile(String mobile) {
        this.mobile = mobile;
    }

    /** @param phone new phone value; stored as given */
    public void setPhone(String phone) {
        this.phone = phone;
    }

    /** @param order new order value; stored as given */
    public void setOrder(String order) {
        this.order = order;
    }
}
爬虫解析百度:
package com.magus.datahub.master.jsoup;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
 * Demo that fetches the Baidu home page with jsoup and prints the
 * {@code value} attribute of an {@code <input>} found inside the second
 * {@code <span>} of the first {@code <form>}.
 *
 * <p>On success the elapsed wall-clock time in milliseconds is printed as well.
 *
 * @Author lxk
 * @Date 2019/12/18 0018 15:17
 * @Version V1.0
 */
public class JsoupUrlController {

    /**
     * Downloads the page, extracts the input value and prints it.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            long start = System.currentTimeMillis();
            Document document = Jsoup.connect("https://www.baidu.com/").get();

            // The served markup is outside our control: check each lookup
            // before dereferencing, because first() yields null on an empty
            // selection.
            Elements forms = document.getElementsByTag("form");
            if (forms.isEmpty()) {
                System.err.println("No <form> element found in the fetched page");
                return;
            }

            Elements spans = forms.first().select("span");
            // eq(1) selects the second span; it is an empty selection when
            // fewer than two spans exist, and so is the select() on it.
            Elements inputs = spans.eq(1).select("input");
            if (inputs.isEmpty()) {
                System.err.println("No <input> found inside the second <span> of the first <form>");
                return;
            }

            System.out.println(inputs.first().attr("value"));
            long end = System.currentTimeMillis();
            System.out.println("执行任务消耗了:" + (end - start));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
因篇幅问题不能全部显示,请点此查看更多更全内容