I needed to scrape every post in a particular Tieba forum and save it, but I can't write Python, so I turned to Java's jsoup to build the crawler. After many attempts it finally worked; the main obstacle was Baidu's anti-crawler mechanism. Having solved that, I'm writing this up for future reference.

Scraping Baidu Tieba Data with Jsoup

Given a forum URL, extract every post in the forum and write the output to .txt or .md files.

Screenshot of the result:

Inside a generated file:

Without further ado, here is the full code:

/**
 * @Author: xy
 * @Date: 2021/3/1 21:10
 * Crawls every post in a forum: open the first page of the target forum,
 * copy the URL from the address bar, and pass it to createHome().
 */

@RestController
@RequestMapping(value = "xy/getTxt", produces = "text/plain;charset=utf-8")
public class JsoupController {
    /**
     * Request headers for the forum list pages
     */
    public static final Map<String, String> HOME_HEARD_MAP = new HashMap<>();
    /**
     * Request headers for post pages
     */
    public static final Map<String, String> CONTENT_HEARD_MAP = new HashMap<>();
    /**
     * Counters used below: current list page, total posts, total floors
     */
    public static int PAGE = 1;
    public static int TotalCount = 0;
    public static int ContentTotalCount = 0;

    public static Map<String, String> getHomeHeardMap() {
        HOME_HEARD_MAP.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
        HOME_HEARD_MAP.put("Accept-Encoding", "gzip, deflate, br");
        HOME_HEARD_MAP.put("Accept-Language", "zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6");
        HOME_HEARD_MAP.put("Connection", "keep-alive");
        if (!HOME_HEARD_MAP.containsKey("Cookie")) {
            HOME_HEARD_MAP.put("Cookie", "BIDUPSID=45401D87AD2D1AC10DC8EF4AF5BF2AAD; PSTM=1595318555; BAIDUID=45401D87AD2D1AC1B7A11A202D1726BA:FG=1; bdshare_firstime=1595501258246; H_WISE_SIDS=154034_154770_153759_151993_155858_149355_150967_156818_156286_155320_154259_155984_148867_155683_156096_154804_156622_153444_152409_131861_154772_155436_153755_151016_127969_154413_154175_155962_155331_152981_155908_150346_155803_146732_131423_154037_155394_154189_156945_155344_157024_154953_157075_151872_144966_153657_154214_154118_154801_154902_156726_155931_154145_147551_157028_153446_156606_152310_155388_154357_155864_110085_157006; MCITY=-187:; __yjs_duid=1_39c522ada6e30df532f8d767834b2a8e1614307604471; top_list=4244232993-7182834579; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=33514_33272_33570_33392_33460_22158; st_key_id=17; wise_device=0; delPer=0; PSINO=6; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1614598932,1614598938,1614651172,1614656659; BCLID=6696522698307635931; BDSFRCVID=-xLOJeC62AC9at3eh_4A8PV7WjpqhyTTH6aoV9hsteac5gjTXm08EG0PfM8g0Ku-qw2ZogKK3gOTHxKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tR-qVIK5tIK3H48k-4QEbbQH-UnLq-RBtgOZ04n-ah05SCb5-4oYqjk3eb3pXt3-W20j0h7m3UTdfh76Wh35K5tTQP6rLtbpKeO4KKJxbp5sShOv5t5rDx_AhUJiB5OMBan7_qvIXKohJh7FM4tW3J0ZyxomtfQxtNRJ0DnjtpChbC8RejKBDj5Mbxv0K-vJ--o2LPoV-TrjDnCrqJ7dXUI8LNDH3xt8K6Pe0Rn7JpDWVML63P62Ktk-3bO7ttoyQJ53Q-bHKR8henc2W-F2eML1Db3hW6vMtg3ts4j5tfcoepvoDPJc3MkbyPjdJJQOBKQB0KnGbUQkeq8CQft20b0EeMtjW6LEK5r2SC_MJCP53j; BCLID_BFESS=6696522698307635931; BDSFRCVID_BFESS=-xLOJeC62AC9at3eh_4A8PV7WjpqhyTTH6aoV9hsteac5gjTXm08EG0PfM8g0Ku-qw2ZogKK3gOTHxKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF_BFESS=tR-qVIK5tIK3H48k-4QEbbQH-UnLq-RBtgOZ04n-ah05SCb5-4oYqjk3eb3pXt3-W20j0h7m3UTdfh76Wh35K5tTQP6rLtbpKeO4KKJxbp5sShOv5t5rDx_AhUJiB5OMBan7_qvIXKohJh7FM4tW3J0ZyxomtfQxtNRJ0DnjtpChbC8RejKBDj5Mbxv0K-vJ--o2LPoV-TrjDnCrqJ7dXUI8LNDH3xt8K6Pe0Rn7JpDWVML63P62Ktk-3bO7ttoyQJ53Q-bHKR8henc2W-F2eML1Db3hW6vMtg3ts4j5tfcoepvoDPJc3MkbyPjdJJQOBKQB0KnGbUQkeq8CQft20b0EeMtjW6LEK5r2SC_MJCP53j; tb_as_data=f91194c6824894d324c39c29837c6b9c50ec65fab018aeb4e474b20db842845825c96f71f4c6eb6aea2f61716f232e787570f1fcab17da5132635e67fb49ac3a2ee926ac26ab414b8dc1f022a26b6af0be5ec5feb08e47ecea40e7c3ac42af63418eb176202b934e8a5d65a31f7f67c9; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1614661234; st_sign=46af36f3; st_data=305c1867cacda6bc575bc022c406a997445a569b4e6fd53fec92a0642aee94c5d695d65fe4c5360c0f99c161476ba8dc7fb649742c1c4278775ae474ae817ef284ce4f5f60d1c0ad9b4c2c7002d944758ca3e2766e503b929c2f411069f9848b4e9bf304bec24493f1d19c3b7526fc3ce49228569294a5afc07905bed78d5368; BAIDUID_BFESS=CDDCC97066F658F1A310835A932F3477:FG=1; BA_HECTOR=04ahah0ga1a5800ko11g3ri0j0r; ZD_ENTRY=baidu; ab_sr=1.0.0_MmE3OWNlMWI1NjEwM2RiYTNmNmUwNjRiZTZiOWUxZjNhMWVjOTU5Y2ZjYzM3YTdiNWNhMTU1ZjRiZTFhNzRhZDU3NTk1Y2RkMGU4MzQyMzkzM2U4OTYzZTYxMDE3OGQ1");
        }
        HOME_HEARD_MAP.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36");
        return HOME_HEARD_MAP;
    }

    public static Map<String, String> getContentHeardMap() {
        CONTENT_HEARD_MAP.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
        CONTENT_HEARD_MAP.put("Accept-Encoding", "gzip, deflate, br");
        CONTENT_HEARD_MAP.put("Accept-Language", "zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6");
        CONTENT_HEARD_MAP.put("Connection", "keep-alive");
        CONTENT_HEARD_MAP.put("Cache-Control", "max-age=0");
        if (!CONTENT_HEARD_MAP.containsKey("Cookie")) {
            CONTENT_HEARD_MAP.put("Cookie", "BIDUPSID=45401D87AD2D1AC10DC8EF4AF5BF2AAD; PSTM=1595318555; BAIDUID=45401D87AD2D1AC1B7A11A202D1726BA:FG=1; bdshare_firstime=1595501258246; H_WISE_SIDS=154034_154770_153759_151993_155858_149355_150967_156818_156286_155320_154259_155984_148867_155683_156096_154804_156622_153444_152409_131861_154772_155436_153755_151016_127969_154413_154175_155962_155331_152981_155908_150346_155803_146732_131423_154037_155394_154189_156945_155344_157024_154953_157075_151872_144966_153657_154214_154118_154801_154902_156726_155931_154145_147551_157028_153446_156606_152310_155388_154357_155864_110085_157006; MCITY=-187:; BDSFRCVID_BFESS=oj0OJexroG3V4WbeM-8t8PVZdeKK0gOTDYLtOwXPsp3LGJLVgVbOEG0PtEhTCoub_2AUogKK0gOTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF_BFESS=tR-qVIK5tIK3H48k-4QEbbQH-UnLq-RBtgOZ04n-ah05SCb5-4oYqjk3eb3pXt3-W20j0h7m3UTdfh76Wh35K5tTQP6rLtbpKeO4KKJxbp5sShOv5t5rDx_AhUJiB5OMBan7_qvIXKohJh7FM4tW3J0ZyxomtfQxtNRJ0DnjtpChbRO4-TF-jjQyDU5; __yjs_duid=1_39c522ada6e30df532f8d767834b2a8e1614307604471; top_list=4244232993-7182834579; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=33514_33272_33570_33392_33460_22158; st_key_id=17; wise_device=0; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1614598461,1614598932,1614598938,1614651172; delPer=0; PSINO=6; BA_HECTOR=018g2g0l25052l0g3a1g3r9t70q; tb_as_data=f91194c6824894d324c39c29837c6b9c50ec65fab018aeb4e474b20db842845825c96f71f4c6eb6aea2f61716f232e78d3c5a1aeac7bb1e60c92309e4169db1b26e5605e313a1f6752c4b482431b1fe7faa5841c98f1d6409e6296b85974757ebd72ed37a64a1ff6d29f3ddc6e838db0; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1614656468; ab_sr=1.0.0_MDAwYzVjYzdmZjhhNzYzNjhlMGZmZWRiN2FkYmJiZGMwM2VmZmJmY2ZmNDljNWRmYWEyNzE2NDk5YzRhNGExNjQzNTEzM2I5YjMwZGRkYTgwMmE1MjQyNWFmNjc1ZGUw; st_data=305c1867cacda6bc575bc022c406a997445a569b4e6fd53fec92a0642aee94c5d695d65fe4c5360c0f99c161476ba8dc7fb649742c1c4278775ae474ae817ef284ce4f5f60d1c0ad9b4c2c7002d944758ca3e2766e503b929c2f411069f9848bc1e6ab946a724413075895137bad90ae1601113654bfc92f8dec8bf5d540fc12; st_sign=6215e595");
        }
        CONTENT_HEARD_MAP.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36");
        return CONTENT_HEARD_MAP;
    }

    /**
     * Fetch the post list (forum pages)
     */
    @PostMapping("public/createHome")
    public String createHome(String href) throws IOException {
        // Open the connection and fetch the response
        Document doc = Jsoup.connect(href)
                .headers(getHomeHeardMap())
                .get();
        // Strip the HTML comment markers Tieba wraps around the body
        String html = doc.body().toString().replace("<!--", "").replace("-->", "");
        if (html.contains("网络不给力")) {
            System.out.println("Baidu's anti-crawler verification was triggered and the program is paused. Replace the forum-list cookie. The blocked URL is: \"" + href + "\"");
            System.out.println("Paste the cookie copied from your browser below:");
            String next = new Scanner(System.in).nextLine();
            // Swap in the new cookie
            HOME_HEARD_MAP.put("Cookie", next);
            // Retry, and stop processing the verification page
            return createHome(href);
        }
        Document parse;
        try {
            parse = Jsoup.parse(html);
        } catch (Exception e) {
            throw new RuntimeException("解析贴外列表时异常,请尝试更换Jsoup版本");
        }

        // Posts on the current page
        Elements aContent = parse.select("a.j_th_tit");
        TotalCount += aContent.size();
        for (Element element : aContent) {
            String text = element.text();
            if ("专用水楼".equals(text)) {
                // Skip the dedicated flood thread; it is simply too large...
                continue;
            }
            href = element.attr("href");
            System.out.println("Entering post titled \"" + text + "\"; its relative URL is: " + href);
            content("https://tieba.baidu.com" + href, text);
        }
        // Is there a next page?
        Elements select = parse.select("a[class~=^next.pagination-item]");
        System.out.println("Current page: " + PAGE + "; posts on this page: " + aContent.size());
        if (select != null && select.size() == 1) {
            Element element = select.get(0);
            if ("下一页".equals(element.text().replace(">", ""))) {
                href = element.attr("href");
                //递归获取数据
                PAGE++;
                createHome("https:" + href);
            }
        }
        System.out.println("吧内帖子总数为:" + TotalCount + ",总楼层为:" + ContentTotalCount);
        return "ok";
    }

    /**
     * Fetch the content of a single post
     */
    public static void content(String contentHref, String titlePath) throws IOException {
        // Open the connection and fetch the response
        Document doc = Jsoup.connect(contentHref)
                .headers(getContentHeardMap())
                .get();
        // Strip the HTML comment markers Tieba wraps around the body
        String html = doc.body().toString().replace("<!--", "").replace("-->", "");
        if (html.contains("网络不给力")) {
            System.out.println("Baidu's anti-crawler verification was triggered and the program is paused. Replace the post cookie. The blocked URL is: \"" + contentHref + "\"");
            System.out.println("Paste the cookie copied from your browser below:");
            String next = new Scanner(System.in).nextLine();
            // Swap in the new cookie
            CONTENT_HEARD_MAP.put("Cookie", next);
            System.out.println("The cookie is: " + CONTENT_HEARD_MAP.get("Cookie"));
            // Retry, then stop processing the verification page
            content(contentHref, titlePath);
            return;
        }
        Document parse;
        try {
            parse = Jsoup.parse(html);
        } catch (Exception e) {
            throw new RuntimeException("解析帖子内容时异常,请尝试更换Jsoup版本");
        }
        // Find every floor (reply) on the page
        Elements select = parse.select("div[class~=^l_post.l_post_bright.j_l_post.clearfix]");
        ContentTotalCount += select.size();
        // Replace characters that are not legal in file names
        titlePath = titlePath.replace("/", "-").replace("<", "《")
                .replace(">", "》").replace("|", "-").replace("\\", "");
        File file = new File("D:/IO/" + titlePath + ".md");
        // Make sure the output directory exists before creating the file
        file.getParentFile().mkdirs();
        if (!file.exists()) {
            file.createNewFile();
        }
        // Open a writer on the file in append mode
        FileWriter fileWriter = new FileWriter(file, true);
        for (Element allElement : select) {
            Elements elementsByAttribute = allElement.getElementsByAttribute("data-locate");
            // Filter out Tieba's injected ads (they carry a data-locate attribute)
            if ("".equals(elementsByAttribute.toString())) {
                // The floor author's name
                Elements name = allElement.select("a[class~=^p_author_name]");
                // The floor's content
                Elements content = allElement.select("div[class~=^d_post_content.j_d_post_content]");
                // The posting date and floor number
                Elements span = allElement.select("span[class~=^tail-info]");
                String lou = "";
                String date = "";
                for (Element element : span) {
                    String text = element.text();
                    // Floor numbers on the page end with the character "楼"
                    if (text.endsWith("楼")) {
                        lou = text;
                    }
                    // Dates contain "-" (e.g. 2021-03-01)
                    if (text.contains("-")) {
                        date = text;
                    }
                }
                // Write the floor to the file
                fileWriter.write("Floor author: \"" + name.text() + "\"");
                if (!"".equals(content.text())) {
                    fileWriter.write("\r\n");
                    fileWriter.write("Content: \"" + content.text() + "\"");
                }
                fileWriter.write("\r\n");
                System.out.println("Floor author: \"" + name.text() + "\"; content: " + content.text());
                for (Element element : content) {
                    Elements imgs = element.select("img[class~=^BDE_Image]");
                    String src = imgs.attr("src");
                    if (!"".equals(src)) {
                        fileWriter.write("当前楼层图片内容为:\"" + src + "\"");
                        fileWriter.write("\r\n");
                        System.out.println("当前楼层图片内容为:\"" + src + "\"");
                    }
                }
                fileWriter.write(lou + "  ");
                fileWriter.write(date);
                fileWriter.write("\r\n");
                fileWriter.write("-----------------------------------------------------------------------------------分割线-----------------------------------------------------------------------------------");
                fileWriter.write("\r\n");
                // Flush the buffer so progress survives a crash
                fileWriter.flush();
            }
        }
        // Is there a next page?
        Elements a = parse.select("li[class~=^l_pager.pager_theme_5]");
        if (a.size() != 0) {
            Elements a1 = a.get(0).getElementsByTag("a");
            for (Element next : a1) {
                if ("下一页".equals(next.text())) {
                    contentHref = next.attr("href");
                    //递归获取数据
                    content("https://tieba.baidu" + contentHref, titlePath);
                }
            }
        }
        // Close the writer
        fileWriter.close();
        System.out.println("Cumulative floor count: " + ContentTotalCount);
    }
}

Adding the dependency

        <!-- jsoup: HTML parsing -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>

Note: with jsoup 1.12.2, Jsoup.parse() throws an IOException on certain pages; pinning the version to 1.11.3 avoids it.

Creating the connection with Jsoup

Jsoup's connect establishes a connection to the Tieba server, much like an HttpClient request: pass it a URL and it returns a Document. Taking the "抗压背锅" forum as an example, open the forum's first page in a browser, copy the post-list URL from the address bar, and pass it to connect.


headers sets the request headers so the request mimics a browser when fetching the HTML response; this is explained below.

        // Open the connection and fetch the response
        Document doc = Jsoup.connect(href)
                .headers(getHomeHeardMap())
                .get();

Configuring the request headers

Tieba rate-limits page requests by IP: once the same LAN IP has requested the list pages or post pages too many times, Baidu serves an anti-crawler security-verification page. So we configure two reusable header maps that mimic a real browser to get past it.

    /** Request headers for the forum list pages */
    public static final Map<String, String> HOME_HEARD_MAP = new HashMap<>();

    /** Request headers for post pages */
    public static final Map<String, String> CONTENT_HEARD_MAP = new HashMap<>();

    public static Map<String, String> getHomeHeardMap() {
        HOME_HEARD_MAP.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
        HOME_HEARD_MAP.put("Accept-Encoding", "gzip, deflate, br");
        HOME_HEARD_MAP.put("Accept-Language", "zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6");
        HOME_HEARD_MAP.put("Connection", "keep-alive");
        // Why we only set the cookie when absent is explained below
        if (!HOME_HEARD_MAP.containsKey("Cookie")) {
            HOME_HEARD_MAP.put("Cookie", "BIDUPSID=45401D87AD2D1AC10DC8EF4AF5BF2AAD; PSTM=1595318555; BAIDUID=45401D87AD2D1AC1B7A11A202D1726BA:FG=1; bdshare_firstime=1595501258246; H_WISE_SIDS=154034_154770_153759_151993_155858_149355_150967_156818_156286_155320_154259_155984_148867_155683_156096_154804_156622_153444_152409_131861_154772_155436_153755_151016_127969_154413_154175_155962_155331_152981_155908_150346_155803_146732_131423_154037_155394_154189_156945_155344_157024_154953_157075_151872_144966_153657_154214_154118_154801_154902_156726_155931_154145_147551_157028_153446_156606_152310_155388_154357_155864_110085_157006; MCITY=-187:; __yjs_duid=1_39c522ada6e30df532f8d767834b2a8e1614307604471; top_list=4244232993-7182834579; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=33514_33272_33570_33392_33460_22158; st_key_id=17; wise_device=0; delPer=0; PSINO=6; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1614598932,1614598938,1614651172,1614656659; BCLID=6696522698307635931; BDSFRCVID=-xLOJeC62AC9at3eh_4A8PV7WjpqhyTTH6aoV9hsteac5gjTXm08EG0PfM8g0Ku-qw2ZogKK3gOTHxKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tR-qVIK5tIK3H48k-4QEbbQH-UnLq-RBtgOZ04n-ah05SCb5-4oYqjk3eb3pXt3-W20j0h7m3UTdfh76Wh35K5tTQP6rLtbpKeO4KKJxbp5sShOv5t5rDx_AhUJiB5OMBan7_qvIXKohJh7FM4tW3J0ZyxomtfQxtNRJ0DnjtpChbC8RejKBDj5Mbxv0K-vJ--o2LPoV-TrjDnCrqJ7dXUI8LNDH3xt8K6Pe0Rn7JpDWVML63P62Ktk-3bO7ttoyQJ53Q-bHKR8henc2W-F2eML1Db3hW6vMtg3ts4j5tfcoepvoDPJc3MkbyPjdJJQOBKQB0KnGbUQkeq8CQft20b0EeMtjW6LEK5r2SC_MJCP53j; BCLID_BFESS=6696522698307635931; BDSFRCVID_BFESS=-xLOJeC62AC9at3eh_4A8PV7WjpqhyTTH6aoV9hsteac5gjTXm08EG0PfM8g0Ku-qw2ZogKK3gOTHxKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF_BFESS=tR-qVIK5tIK3H48k-4QEbbQH-UnLq-RBtgOZ04n-ah05SCb5-4oYqjk3eb3pXt3-W20j0h7m3UTdfh76Wh35K5tTQP6rLtbpKeO4KKJxbp5sShOv5t5rDx_AhUJiB5OMBan7_qvIXKohJh7FM4tW3J0ZyxomtfQxtNRJ0DnjtpChbC8RejKBDj5Mbxv0K-vJ--o2LPoV-TrjDnCrqJ7dXUI8LNDH3xt8K6Pe0Rn7JpDWVML63P62Ktk-3bO7ttoyQJ53Q-bHKR8henc2W-F2eML1Db3hW6vMtg3ts4j5tfcoepvoDPJc3MkbyPjdJJQOBKQB0KnGbUQkeq8CQft20b0EeMtjW6LEK5r2SC_MJCP53j; tb_as_data=f91194c6824894d324c39c29837c6b9c50ec65fab018aeb4e474b20db842845825c96f71f4c6eb6aea2f61716f232e787570f1fcab17da5132635e67fb49ac3a2ee926ac26ab414b8dc1f022a26b6af0be5ec5feb08e47ecea40e7c3ac42af63418eb176202b934e8a5d65a31f7f67c9; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1614661234; st_sign=46af36f3; st_data=305c1867cacda6bc575bc022c406a997445a569b4e6fd53fec92a0642aee94c5d695d65fe4c5360c0f99c161476ba8dc7fb649742c1c4278775ae474ae817ef284ce4f5f60d1c0ad9b4c2c7002d944758ca3e2766e503b929c2f411069f9848b4e9bf304bec24493f1d19c3b7526fc3ce49228569294a5afc07905bed78d5368; BAIDUID_BFESS=CDDCC97066F658F1A310835A932F3477:FG=1; BA_HECTOR=04ahah0ga1a5800ko11g3ri0j0r; ZD_ENTRY=baidu; ab_sr=1.0.0_MmE3OWNlMWI1NjEwM2RiYTNmNmUwNjRiZTZiOWUxZjNhMWVjOTU5Y2ZjYzM3YTdiNWNhMTU1ZjRiZTFhNzRhZDU3NTk1Y2RkMGU4MzQyMzkzM2U4OTYzZTYxMDE3OGQ1");
        }
        HOME_HEARD_MAP.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36");
        return HOME_HEARD_MAP;
    }

    public static Map<String, String> getContentHeardMap() {
        CONTENT_HEARD_MAP.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
        CONTENT_HEARD_MAP.put("Accept-Encoding", "gzip, deflate, br");
        CONTENT_HEARD_MAP.put("Accept-Language", "zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6");
        CONTENT_HEARD_MAP.put("Connection", "keep-alive");
        CONTENT_HEARD_MAP.put("Cache-Control", "max-age=0");
        // Same guard as above
        if (!CONTENT_HEARD_MAP.containsKey("Cookie")) {
            CONTENT_HEARD_MAP.put("Cookie", "BIDUPSID=45401D87AD2D1AC10DC8EF4AF5BF2AAD; PSTM=1595318555; BAIDUID=45401D87AD2D1AC1B7A11A202D1726BA:FG=1; bdshare_firstime=1595501258246; H_WISE_SIDS=154034_154770_153759_151993_155858_149355_150967_156818_156286_155320_154259_155984_148867_155683_156096_154804_156622_153444_152409_131861_154772_155436_153755_151016_127969_154413_154175_155962_155331_152981_155908_150346_155803_146732_131423_154037_155394_154189_156945_155344_157024_154953_157075_151872_144966_153657_154214_154118_154801_154902_156726_155931_154145_147551_157028_153446_156606_152310_155388_154357_155864_110085_157006; MCITY=-187:; BDSFRCVID_BFESS=oj0OJexroG3V4WbeM-8t8PVZdeKK0gOTDYLtOwXPsp3LGJLVgVbOEG0PtEhTCoub_2AUogKK0gOTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF_BFESS=tR-qVIK5tIK3H48k-4QEbbQH-UnLq-RBtgOZ04n-ah05SCb5-4oYqjk3eb3pXt3-W20j0h7m3UTdfh76Wh35K5tTQP6rLtbpKeO4KKJxbp5sShOv5t5rDx_AhUJiB5OMBan7_qvIXKohJh7FM4tW3J0ZyxomtfQxtNRJ0DnjtpChbRO4-TF-jjQyDU5; __yjs_duid=1_39c522ada6e30df532f8d767834b2a8e1614307604471; top_list=4244232993-7182834579; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=33514_33272_33570_33392_33460_22158; st_key_id=17; wise_device=0; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1614598461,1614598932,1614598938,1614651172; delPer=0; PSINO=6; BA_HECTOR=018g2g0l25052l0g3a1g3r9t70q; tb_as_data=f91194c6824894d324c39c29837c6b9c50ec65fab018aeb4e474b20db842845825c96f71f4c6eb6aea2f61716f232e78d3c5a1aeac7bb1e60c92309e4169db1b26e5605e313a1f6752c4b482431b1fe7faa5841c98f1d6409e6296b85974757ebd72ed37a64a1ff6d29f3ddc6e838db0; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1614656468; ab_sr=1.0.0_MDAwYzVjYzdmZjhhNzYzNjhlMGZmZWRiN2FkYmJiZGMwM2VmZmJmY2ZmNDljNWRmYWEyNzE2NDk5YzRhNGExNjQzNTEzM2I5YjMwZGRkYTgwMmE1MjQyNWFmNjc1ZGUw; st_data=305c1867cacda6bc575bc022c406a997445a569b4e6fd53fec92a0642aee94c5d695d65fe4c5360c0f99c161476ba8dc7fb649742c1c4278775ae474ae817ef284ce4f5f60d1c0ad9b4c2c7002d944758ca3e2766e503b929c2f411069f9848bc1e6ab946a724413075895137bad90ae1601113654bfc92f8dec8bf5d540fc12; st_sign=6215e595");
        }
        CONTENT_HEARD_MAP.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36");
        return CONTENT_HEARD_MAP;
    }

Copy these values from the first request's headers in your browser's developer tools (Network tab) on the forum's front page.
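If you'd rather not hardcode the long cookie string, a minimal sketch (my own suggestion, with a hypothetical file path, not part of the original project) is to load it from an external text file at startup, so it can be rotated without recompiling:

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

public class CookieLoader {
    /**
     * Reads the cookie exported from the browser out of a plain text file.
     * The path "D:/IO/cookie.txt" is a hypothetical example.
     */
    public static String loadCookie() throws IOException {
        return new String(Files.readAllBytes(Paths.get("D:/IO/cookie.txt")), StandardCharsets.UTF_8).trim();
    }
}

Then HOME_HEARD_MAP.put("Cookie", CookieLoader.loadCookie()) would replace the hardcoded literal.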

Next we call Jsoup to fetch the HTML and parse it:

        // Open the connection and fetch the response
        Document doc = Jsoup.connect(href)
                .headers(getHomeHeardMap())
                .get();
        // Strip the HTML comment markers Tieba wraps around the body
        String html = doc.body().toString().replace("<!--", "").replace("-->", "");
        if (html.contains("网络不给力")) {
            System.out.println("Baidu's anti-crawler verification was triggered and the program is paused. Replace the forum-list cookie. The blocked URL is: \"" + href + "\"");
            System.out.println("Paste the cookie copied from your browser below:");
            String next = new Scanner(System.in).nextLine();
            // Swap in the new cookie
            HOME_HEARD_MAP.put("Cookie", next);
            System.out.println("The cookie is: " + HOME_HEARD_MAP.get("Cookie"));
            // Retry, and stop processing the verification page
            return createHome(href);
        }
        Document parse;
        try {
            parse = Jsoup.parse(html);
        } catch (Exception e) {
            throw new RuntimeException("解析贴外列表时异常,请尝试更换Jsoup版本");
        }

The string-replacement step exists because, when you request the forum list, Baidu wraps the main content of the page (the body region) in an HTML comment, as the screenshot below shows:

I have no idea why Baidu does this; versions of the site before 2020 didn't. But since the markers are there, stripping them with a string replacement does the job.
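As an aside, instead of blanking the markers with string replacement you could also unwrap the comment nodes through Jsoup's DOM; a rough sketch of that alternative (my own variant, not what the code above does):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;

public class CommentUnwrapper {
    /**
     * Replaces every HTML comment under root with the parsed markup hidden
     * inside it, so selectors can see the content Baidu commented out.
     */
    public static void unwrapComments(Element root) {
        // Snapshot the children first, because we mutate the tree while iterating
        for (Node node : root.childNodes().toArray(new Node[0])) {
            if (node instanceof Comment) {
                // Parse the hidden fragment and splice it in where the comment sat
                Document hidden = Jsoup.parse(((Comment) node).getData());
                node.replaceWith(hidden.body());
            } else if (node instanceof Element) {
                unwrapComments((Element) node);
            }
        }
    }
}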



The check above detects whether Tieba's security verification has been triggered. Crawling an entire forum means thousands of page requests; past a certain threshold, every further request returns the verification page instead of content, so nothing useful can be parsed from it.

When I first searched for a fix, the advice was that adding an Accept header was enough, but even with it the verification page still appears after enough requests. After much trial and error I found that what really matters is the cookie Tieba returns once the verification has been passed: as long as the request carries a cookie from a verified session, responses come back normally. So I block the program on keyboard input when verification is detected, pass the verification in a browser, copy the verified cookie, paste it in, and let the method recursively call itself. (You could also expose an HTTP endpoint and have the blocked crawler wait for a cookie to be posted; it's a matter of taste, and keyboard input was the simplest for me.)

This is why the header getters only set the cookie if none is present: without that guard, the recursive call would go back through getHomeHeardMap() and overwrite the freshly pasted cookie with the expired one hardcoded at startup.
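For completeness, the HTTP-based alternative mentioned above could look roughly like this; the mapping name is hypothetical and not part of the original controller:

    /**
     * Hypothetical endpoint: POST the cookie copied from a verified browser
     * session here instead of typing it into the console.
     */
    @PostMapping("public/updateCookie")
    public String updateCookie(String cookie) {
        // Overwrite the expired cookie in both header maps; the next
        // Jsoup request will carry the fresh, verified value
        HOME_HEARD_MAP.put("Cookie", cookie);
        CONTENT_HEARD_MAP.put("Cookie", cookie);
        return "cookie updated";
    }

Note this only helps if the crawler isn't blocked on Scanner at that moment; you'd drop the console read and retry on a timer instead.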


Next, analyze the page structure. Pressing F12 shows that every post in the list, including the link to its detail page, lives in an <a> tag whose class attribute contains "j_th_tit".

So Jsoup's select can pull out every post on the current page. The idea: grab the list, read each element's href, and hand it to a child method that opens the post (the child method is shown further down).

        // Posts on the current page
        Elements aContent = parse.select("a.j_th_tit");
        for (Element element : aContent) {
            String text = element.text();
            if ("专用水楼".equals(text)) {
                // Skip the dedicated flood thread; it is simply too large...
                continue;
            }
            href = element.attr("href");
            System.out.println("Entering post titled \"" + text + "\"; its relative URL is: " + href);
            // content() opens the post and parses it
            content("https://tieba.baidu.com" + href, text);
        }

Since we want every post in the forum, we also need the next page's URL. The F12 console reveals the next-page element; select it, and if its text is "下一页" (next page), recurse with that URL:

        // Is there a next page?
        Elements select = parse.select("a[class~=^next.pagination-item]");
        System.out.println("Current page: " + PAGE + "; posts on this page: " + aContent.size());
        if (select != null && select.size() == 1) {
            Element element = select.get(0);
            if ("下一页".equals(element.text().replace(">", ""))) {
                href = element.attr("href");
                // Recurse to fetch each following list page
                createHome("https:" + href);
            }
        }

Parsing the post content

Opening the connection and bypassing the verification work exactly as above, so I won't repeat them. The rest is IO plus some filtering; reading the code should make it clear, as it follows the same pattern.

        // Find every floor (reply) on the page
        Elements select = parse.select("div[class~=^l_post.l_post_bright.j_l_post.clearfix]");
        ContentTotalCount += select.size();
        // Replace characters that are not legal in file names
        titlePath = titlePath.replace("/", "-").replace("<", "《")
                .replace(">", "》").replace("|", "-").replace("\\", "");
        File file = new File("D:/IO/" + titlePath + ".md");
        // Make sure the output directory exists before creating the file
        file.getParentFile().mkdirs();
        if (!file.exists()) {
            file.createNewFile();
        }
        // Open a writer on the file in append mode
        FileWriter fileWriter = new FileWriter(file, true);
        for (Element allElement : select) {
            Elements elementsByAttribute = allElement.getElementsByAttribute("data-locate");
            // Filter out Tieba's injected ads (they carry a data-locate attribute)
            if ("".equals(elementsByAttribute.toString())) {
                // The floor author's name
                Elements name = allElement.select("a[class~=^p_author_name]");
                // The floor's content
                Elements content = allElement.select("div[class~=^d_post_content.j_d_post_content]");
                // The posting date and floor number
                Elements span = allElement.select("span[class~=^tail-info]");
                String lou = "";
                String date = "";
                for (Element element : span) {
                    String text = element.text();
                    // Floor numbers on the page end with the character "楼"
                    if (text.endsWith("楼")) {
                        lou = text;
                    }
                    // Dates contain "-" (e.g. 2021-03-01)
                    if (text.contains("-")) {
                        date = text;
                    }
                }
                // Write the floor to the file
                fileWriter.write("Floor author: \"" + name.text() + "\"");
                if (!"".equals(content.text())) {
                    fileWriter.write("\r\n");
                    fileWriter.write("Content: \"" + content.text() + "\"");
                }
                fileWriter.write("\r\n");
                System.out.println("Floor author: \"" + name.text() + "\"; content: " + content.text());
                for (Element element : content) {
                    Elements imgs = element.select("img[class~=^BDE_Image]");
                    String src = imgs.attr("src");
                    if (!"".equals(src)) {
                        fileWriter.write("当前楼层图片内容为:\"" + src + "\"");
                        fileWriter.write("\r\n");
                        System.out.println("当前楼层图片内容为:\"" + src + "\"");
                    }
                }
                fileWriter.write(lou + "  ");
                fileWriter.write(date);
                fileWriter.write("\r\n");
                fileWriter.write("-----------------------------------------------------------------------------------分割线-----------------------------------------------------------------------------------");
                fileWriter.write("\r\n");
                // Flush the buffer so progress survives a crash
                fileWriter.flush();
            }
        }

In short: find the class names of the elements that hold the data you want, match them with Jsoup's select (optionally via its attribute-regex syntax), and then process and write out the results.
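To make the selector syntax concrete, here is a tiny self-contained sketch (the HTML is made up) showing both forms used above: the plain tag.class selector, and [class~=regex], which matches elements whose class attribute matches a regular expression:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class SelectorDemo {
    public static void main(String[] args) {
        String html = "<div class='l_post l_post_bright j_l_post clearfix'>floor</div>"
                + "<div class='ad_banner'>ad</div>"
                + "<a class='j_th_tit' href='/p/123'>a post title</a>";
        Document doc = Jsoup.parse(html);
        // [class~=^l_post] keeps only elements whose class attribute matches the
        // regex, here: starts with "l_post" (the "." in the article's longer
        // patterns is the regex any-character, which also matches the spaces)
        Elements floors = doc.select("div[class~=^l_post]");
        // Plain tag + class selector, as used for the post list
        Elements titles = doc.select("a.j_th_tit");
        System.out.println(floors.size() + " floor(s), " + titles.size() + " title(s)"); // 1 floor(s), 1 title(s)
    }
}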
