搜索
您的当前位置:首页 > 正文

Jsoup网络爬虫,解析本地html文件

来源:步旅网

核心类(非多线程):

package com.magus.datahub.master.jsoup;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

/**
 * Single-threaded example: parses a local HTML file with jsoup, turns every
 * table row of the first {@code <tbody>} (from row index 2 onwards) into a
 * {@link JsoupUser}, prints it, and finally prints the elapsed milliseconds.
 *
 * @Author lxk
 * @Date 2019/12/18 0018 15:17
 * @Version V1.0
 */
public class JsoupController {

    /**
     * Entry point. Any exception raised while reading or walking the document
     * is caught and its stack trace printed.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            long startedAt = System.currentTimeMillis();

            Document page = Jsoup.parse(
                    new File("C:/Users/Administrator/Desktop/AA.HTML"), "UTF-8", "http://example.com/");
            Elements rows = page.getElementsByTag("tbody").first().select("tr");

            List<JsoupUser> users = new ArrayList<>();
            // Rows 0 and 1 are skipped (presumably the title and header rows).
            int rowIndex = 2;
            while (rowIndex < rows.size()) {
                JsoupUser user = toUser(rows.get(rowIndex));
                rowIndex++;
                // String.join renders null elements as "null", like string concatenation does.
                System.out.println(String.join(" ",
                        user.getId(), user.getCode(), user.getName(), user.getOrg(),
                        user.getMobile(), user.getPhone(), user.getOrder()) + " \n");
                users.add(user);
            }

            long finishedAt = System.currentTimeMillis();
            System.out.println("执行任务消耗了:" + (finishedAt - startedAt));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Copies the text of the first seven {@code <td>} cells of a row into a new user,
     * in the order id, code, name, org, mobile, phone, order. Further cells are ignored.
     *
     * @param row the {@code <tr>} element to read
     * @return the populated user; fields without a matching cell stay {@code null}
     */
    private static JsoupUser toUser(Element row) {
        JsoupUser user = new JsoupUser();
        int column = 0;
        for (Element cell : row.select("td")) {
            String text = cell.text();
            switch (column) {
                case 0:
                    user.setId(text);
                    break;
                case 1:
                    user.setCode(text);
                    break;
                case 2:
                    user.setName(text);
                    break;
                case 3:
                    user.setOrg(text);
                    break;
                case 4:
                    user.setMobile(text);
                    break;
                case 5:
                    user.setPhone(text);
                    break;
                case 6:
                    user.setOrder(text);
                    break;
                default:
                    break;
            }
            column++;
        }
        return user;
    }
}

核心类(多线程):

package com.magus.datahub.master.jsoup;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

/**
 * Multi-threaded example: parses a local HTML file with jsoup, splits the data
 * rows of the first {@code <tbody>} into chunks, converts each chunk to
 * {@link JsoupUser} objects on a thread pool, prints every user in row order,
 * and finally prints the elapsed milliseconds.
 *
 * @Author lxk
 * @Date 2019/12/18 0018 15:17
 * @Version V1.0
 */
public class JsoupThreadController {

    /** Leading rows (title and column header) that carry no user data. */
    private static final int HEADER_ROWS = 2;

    /** Maximum number of rows handed to one task. */
    private static final int ROWS_PER_TASK = 500;

    /**
     * Entry point. Exceptions are caught and their stack trace printed; the
     * thread pool is always shut down so the JVM can exit.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        ExecutorService exec = null;
        try {
            long start = System.currentTimeMillis();

            File input = new File("C:/Users/Administrator/Desktop/AA.HTML");
            Document document = Jsoup.parse(input, "UTF-8", "http://example.com/");
            Element tbody = document.getElementsByTag("tbody").first();
            if (tbody == null) {
                System.err.println("No <tbody> element found in " + input);
                return;
            }
            Elements trs = tbody.select("tr");

            // Skip the title and header rows through a view instead of removing them,
            // so a table with fewer than two rows simply yields no data.
            List<Element> dataRows = trs.subList(Math.min(HEADER_ROWS, trs.size()), trs.size());
            int dataSize = dataRows.size();
            // Ceiling division: exactly one task per started block of ROWS_PER_TASK rows.
            int taskCount = (dataSize + ROWS_PER_TASK - 1) / ROWS_PER_TASK;

            List<Callable<List<JsoupUser>>> tasks = new ArrayList<>(taskCount);
            for (int from = 0; from < dataSize; from += ROWS_PER_TASK) {
                final List<Element> chunk =
                        dataRows.subList(from, Math.min(from + ROWS_PER_TASK, dataSize));
                // NOTE(review): tasks only read from the parsed DOM; confirm that
                // concurrent reads of jsoup elements are acceptable.
                tasks.add(() -> parseRows(chunk));
            }

            // newFixedThreadPool rejects a size of 0, hence the lower bound of 1.
            exec = Executors.newFixedThreadPool(Math.max(1, taskCount));

            // invokeAll both runs the tasks and waits for them; submitting them
            // separately beforehand would execute every task twice.
            List<Future<List<JsoupUser>>> results = exec.invokeAll(tasks);
            for (Future<List<JsoupUser>> future : results) {
                for (JsoupUser u : future.get()) {
                    System.out.println(u.getId() + " " + u.getCode() + " " + u.getName() + " " + u.getOrg()
                            + " " + u.getMobile() + " " + u.getPhone() + " " + u.getOrder() + " \n");
                }
            }

            long end = System.currentTimeMillis();
            System.out.println("执行任务消耗了:" + (end - start));
        } catch (InterruptedException e) {
            // Restore the interrupt flag before reporting.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (exec != null) {
                exec.shutdown();
            }
        }
    }

    /**
     * Converts each {@code <tr>} of a chunk into a user.
     *
     * @param rows the rows to convert
     * @return one user per row, in the same order
     */
    private static List<JsoupUser> parseRows(List<Element> rows) {
        List<JsoupUser> users = new ArrayList<>(rows.size());
        for (Element tr : rows) {
            users.add(toUser(tr));
        }
        return users;
    }

    /**
     * Copies the text of the first seven {@code <td>} cells of a row into a new user,
     * in the order id, code, name, org, mobile, phone, order. Further cells are ignored.
     *
     * @param tr the row element to read
     * @return the populated user; fields without a matching cell stay {@code null}
     */
    private static JsoupUser toUser(Element tr) {
        JsoupUser user = new JsoupUser();
        Elements tds = tr.select("td");
        for (int k = 0; k < tds.size(); k++) {
            String text = tds.get(k).text();
            switch (k) {
                case 0:
                    user.setId(text);
                    break;
                case 1:
                    user.setCode(text);
                    break;
                case 2:
                    user.setName(text);
                    break;
                case 3:
                    user.setOrg(text);
                    break;
                case 4:
                    user.setMobile(text);
                    break;
                case 5:
                    user.setPhone(text);
                    break;
                case 6:
                    user.setOrder(text);
                    break;
                default:
                    break;
            }
        }
        return user;
    }
}

解析为目标对象:

package com.magus.datahub.master.jsoup;

/**
 * Mutable value holder for one parsed user record. Every field is a plain
 * string and starts out as {@code null} until its setter is called.
 *
 * @Author lxk
 * @Date 2019/12/18 0018 15:54
 * @Version V1.0
 */
public class JsoupUser {

    // Identification
    private String id;
    private String code;
    private String name;

    // Organisation and contact details
    private String org;
    private String mobile;
    private String phone;

    // Stored as text, exactly as supplied
    private String order;

    // ---------------------------------------------------------------- getters

    /** @return the id, or {@code null} if never set */
    public String getId() {
        return this.id;
    }

    /** @return the code, or {@code null} if never set */
    public String getCode() {
        return this.code;
    }

    /** @return the name, or {@code null} if never set */
    public String getName() {
        return this.name;
    }

    /** @return the org, or {@code null} if never set */
    public String getOrg() {
        return this.org;
    }

    /** @return the mobile value, or {@code null} if never set */
    public String getMobile() {
        return this.mobile;
    }

    /** @return the phone value, or {@code null} if never set */
    public String getPhone() {
        return this.phone;
    }

    /** @return the order value, or {@code null} if never set */
    public String getOrder() {
        return this.order;
    }

    // ---------------------------------------------------------------- setters

    /** @param id the new id; stored as given */
    public void setId(String id) {
        this.id = id;
    }

    /** @param code the new code; stored as given */
    public void setCode(String code) {
        this.code = code;
    }

    /** @param name the new name; stored as given */
    public void setName(String name) {
        this.name = name;
    }

    /** @param org the new org; stored as given */
    public void setOrg(String org) {
        this.org = org;
    }

    /** @param mobile the new mobile value; stored as given */
    public void setMobile(String mobile) {
        this.mobile = mobile;
    }

    /** @param phone the new phone value; stored as given */
    public void setPhone(String phone) {
        this.phone = phone;
    }

    /** @param order the new order value; stored as given */
    public void setOrder(String order) {
        this.order = order;
    }
}

爬虫解析百度:

package com.magus.datahub.master.jsoup;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;


/**
 * Fetches the Baidu home page with jsoup and prints the {@code value}
 * attribute of the first {@code <input>} found inside the second
 * {@code <span>} of the first {@code <form>}, followed by the elapsed
 * milliseconds.
 *
 * @Author lxk
 * @Date 2019/12/18 0018 15:17
 * @Version V1.0
 */
public class JsoupUrlController {

    /**
     * Entry point. Any exception raised while fetching or walking the page
     * is caught and its stack trace printed.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            final long begin = System.currentTimeMillis();

            Document page = Jsoup.connect("https://www.baidu.com/").get();

            // All <span> elements inside the first <form> of the page.
            Elements formSpans = page.getElementsByTag("form").first().select("span");
            // Keep only the span at index 1, then look for <input> elements within it.
            Elements inputs = formSpans.eq(1).select("input");

            String value = inputs.first().attr("value");
            System.out.println(value);

            final long finish = System.currentTimeMillis();
            System.out.println("执行任务消耗了:" + (finish - begin));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

 

 

因篇幅问题不能全部显示,请点此查看更多更全内容

Top