java爬取图片

写在前面的叫前言

本帖子只是交流学习，如发现不妥，请联系删除

一、说明

闲来无事，学习一下爬取网页资源。闲话少说，先准备依赖：需要 jsoup 和 commons-io 两个 jar 包（请自行下载，推荐下载地址：mvnrepository.com）。

需要注意的事项：首先观察翻页时地址栏的变化规律，比如第一页、第二页、第三页的地址分别是：

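https://www.ivsky.com/bizhi/nvxing/
https://www.ivsky.com/bizhi/nvxing/index_2.html —— 女性明星壁纸 - 高清美女壁纸_第2页 (天堂图片网)
https://www.ivsky.com/bizhi/nvxing/index_3.html —— 女性明星壁纸 - 高清美女壁纸_第3页 (天堂图片网)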

二、工具类

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class JsoupUtil {
    /**
     * Jspup工具类 url:采集的URL domian: 采集的域名
     */
    public static Document getDocument(String url, String domain) {
        int error_count = 0;
        Document doc = null;
        while (true) {
            if (error_count > 10) {
                break;
            }
            try {
                doc = Jsoup.connect(url).timeout(6000)
                        .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
                        .header("Accept-Encoding", "gzip,deflate,sdch").header("Connection", "keep-alive")
                        .header("referer", domain).header("cookie", "data").followRedirects(true)
                        .userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)").get();
            } catch (Exception e) {
                error_count++;
            }
            if (doc != null) {
                break;
            }
        }
        return doc;
    }

    public static Document parseHtml(String html) {
        return Jsoup.parse(html);
    }

}

三、爬取方法

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;

import org.apache.commons.io.FileUtils;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class TestSImage {

    /** Base URL of the wallpaper listing to crawl; follow-up page URLs are built on top of it. */
    public static final String PRE_URL = "https://www.ivsky.com/bizhi/nvxing/";

    /** Home page of the target site. */
    public static final String SOURCE_HTML = "https://www.ivsky.com/";

    /** Local directory the downloaded images are saved into. */
    public static final String GENERATE_PATH = "E:\\2\\";

    /**
     * Downloads every image found inside the {@code ali} element of a listing page, then
     * moves on to the following pages until one of them yields nothing to download.
     *
     * <p>Follow-up pages are addressed as {@code PRE_URL + "index_" + n + ".html"}, where
     * {@code n} starts at {@code index + 1} and grows by one per page. Paging stops when a
     * page cannot be fetched, has no {@code ali} element, or contains no images. Pages are
     * walked in a loop, so the call stack does not grow with the number of pages.
     *
     * @param detailHtml   URL of the first listing page to crawl
     * @param sourceHtml   passed to {@code JsoupUtil.getDocument} as its second argument
     * @param generatePath local directory the images are saved into
     * @param index        page counter; the first follow-up page uses {@code index + 1}
     * @throws Exception if an image URL is malformed or a download fails
     */
    public static void getGirlImage(String detailHtml, String sourceHtml, String generatePath, int index)
            throws Exception {
        String pageUrl = detailHtml;
        int pageIndex = index;

        while (true) {
            Document doc = JsoupUtil.getDocument(pageUrl, sourceHtml);
            if (doc == null) {
                // getDocument gave up on this page; treat it as the end of the listing.
                System.out.println("Stopping, page could not be fetched: " + pageUrl);
                return;
            }

            Element container = doc.getElementsByClass("ali").first();
            if (container == null) {
                System.out.println("Stopping, no image container on page: " + pageUrl);
                return;
            }

            // Select on the live element (no re-parse) so it keeps the page's base URI,
            // which absUrl needs to resolve relative and protocol-relative src values.
            Elements images = container.select("img[src]");
            if (images.isEmpty()) {
                return;
            }

            for (Element img : images) {
                String src = img.absUrl("src");
                if (src.isEmpty()) {
                    // absUrl returns "" when the attribute cannot be turned into a URL.
                    continue;
                }
                String name = src.substring(src.lastIndexOf('/') + 1);
                // The random prefix keeps equally named images from overwriting each other.
                File target = new File(generatePath, Math.random() * 10000 + name);
                FileUtils.copyURLToFile(new URL(src), target);
            }

            // Build the address of the next page and keep going.
            pageIndex++;
            pageUrl = PRE_URL + "index_" + pageIndex + ".html";
            System.out.println(pageUrl);
        }
    }

    /**
     * Entry point: starts the crawl at {@code PRE_URL} with the page counter at zero,
     * saving images under {@code GENERATE_PATH}.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the crawl fails
     */
    public static void main(String[] args) throws MalformedURLException, IOException, Exception {
        final int startIndex = 0;
        getGirlImage(PRE_URL, SOURCE_HTML, GENERATE_PATH, startIndex);
    }