This is my first fully working web scraper. It crawls images from jandan.net (煎蛋网); pointers from more experienced developers are welcome.

To run it, just set the PAGE parameter. The total number of pages scraped is one more than PAGE, because the starting page is fetched first; for example, PAGE = 3 fetches 4 pages in total.

# @Time    : 2020-07-20
# @Author  : 黎先生
# @FileName: jandan.py
# @Blog    : https://blog.csdn.net/q79815321


import os
import re

import requests
from bs4 import BeautifulSoup

# Regular expression that pulls the protocol-relative image URL out of an <img> tag
FIND_IMG = re.compile(r'src="//(.*?)"')
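# For example (a hypothetical jandan image tag, for illustration only):
#   tag = '<img src="//wx1.sinaimg.cn/mw600/example.jpg" referrerpolicy="no-referrer" />'
#   re.findall(FIND_IMG, tag)  ->  ['wx1.sinaimg.cn/mw600/example.jpg']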

# Number of extra pages to scrape after the starting page
PAGE = 0

# Starting page
URL = "http://i.jandan.net/ooxx"

# Request headers
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
    "Connection": "keep-alive",
    }
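
# Note (an assumption about the site's behavior, not documented): the
# browser-like User-Agent makes the requests look like ordinary visits;
# without it some hosts may refuse to serve the page or the images.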


# Download every image on the given page
def Get_Img(URL):

    # Make sure the output directory exists
    os.makedirs("./pic", exist_ok=True)

    # Request the page
    response = requests.get(URL, headers=HEADERS)

    bs = BeautifulSoup(response.text, "html.parser")

    # jandan's picture tags carry referrerpolicy="no-referrer",
    # so that attribute selects exactly the image <img> tags
    data_list = bs.find_all("img", referrerpolicy="no-referrer")

    # Download the images one by one
    for i in data_list:

        # Extract the protocol-relative image URL with the regex
        img = re.findall(FIND_IMG, str(i))

        # Skip tags the regex does not match
        if not img:
            continue

        # Fetch the image first, then write it to disk
        # (the last 10 characters of the URL serve as the file name)
        response = requests.get(url="http://" + img[0], headers=HEADERS)

        with open("./pic/{}".format(img[0][-10:]), 'wb') as f:
            f.write(response.content)

# Get the link to the next page
def Get_Next_Url(URL):
    response = requests.get(URL, headers=HEADERS)
    bs = BeautifulSoup(response.text, "html.parser")

    # jandan lists the newest pictures first, so the "previous-comment-page"
    # link points at the next (older) page to scrape
    html = bs.find_all('a', class_="previous-comment-page")

    # Extract the protocol-relative href with a regex
    next_url = re.findall(r'href="//(.*?)#comments', str(html[0]))

    # Return it as an absolute URL
    return "http://" + next_url[0]
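
# For example, given a link like this on the page (page number hypothetical):
#   <a href="//jandan.net/ooxx/page-52#comments" class="previous-comment-page">...</a>
# Get_Next_Url returns "http://jandan.net/ooxx/page-52".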

if __name__ == "__main__":

    # Scrape the images on the starting page
    Get_Img(URL)

    # Then scrape PAGE more pages
    for i in range(PAGE):

        # Get the link to the next page
        URL = Get_Next_Url(URL)

        # Scrape that page as well
        Get_Img(URL)
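
As a possible refinement (a sketch, not part of the original script): BeautifulSoup already exposes each tag's src attribute, so the regex step can be dropped, and checking the response status skips broken downloads. The name Get_Img_V2 below is made up for illustration; it reuses the HEADERS defined above.

def Get_Img_V2(URL):
    # Fetch the page and fail loudly on HTTP errors
    response = requests.get(URL, headers=HEADERS)
    response.raise_for_status()

    bs = BeautifulSoup(response.text, "html.parser")

    os.makedirs("./pic", exist_ok=True)

    for tag in bs.find_all("img", referrerpolicy="no-referrer"):
        # Read the protocol-relative URL straight off the tag
        src = tag.get("src")
        if not src:
            continue
        img_url = "http:" + src if src.startswith("//") else src

        # Save the image only when the download succeeded,
        # again naming the file after the URL's last 10 characters
        img_response = requests.get(img_url, headers=HEADERS)
        if img_response.ok:
            with open("./pic/" + img_url[-10:], "wb") as f:
                f.write(img_response.content)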
