第一次真正地完成了一个爬虫,爬的是煎蛋网图片,望大神指点
爬取的时候只需要设置PAGE参数就可以了,爬取的总页数会比设置的页数多一页
# @Time : 2020-07-20
# @Author : 黎先生
# @FileName: jandan.py
# @Blog :https://blog.csdn.net/q79815321
import os
import re

import requests
from bs4 import BeautifulSoup
# Regex extracting the protocol-less image src ("//host/path") from an <img> tag
FIND_IMG = re.compile(r'src="//(.*?)"')
# Number of extra pages to crawl beyond the first (total pages = PAGE + 1)
PAGE = 0
# Start page.
# NOTE(review): the scraped text had lost the TLD ("http://i.jandan" does not
# resolve) — restored to jandan.net's ooxx page; confirm the host still works.
URL = "http://i.jandan.net/ooxx"
# Request headers mimicking a desktop Chrome browser so the site serves the page
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
    "Connection": "keep-alive",
}
# Download images
def Get_Img(URL):
    """Download every image found on the page at *URL* into ./pic/.

    Fixes over the original:
    - creates ./pic if missing (original crashed with FileNotFoundError),
    - skips <img> tags whose src does not match the regex (original raised
      IndexError),
    - fetches the image BEFORE opening the output file, so a failed request
      no longer leaves a truncated/empty file behind.
    """
    # Ensure the output directory exists.
    os.makedirs("./pic", exist_ok=True)
    # Fetch and parse the listing page.
    response = requests.get(URL, headers=HEADERS)
    bs = BeautifulSoup(response.text, "html.parser")
    # jandan serves its gallery images with referrerpolicy="no-referrer".
    data_list = bs.find_all("img", referrerpolicy="no-referrer")
    for tag in data_list:
        # Extract the protocol-less src ("//host/path") from the tag markup.
        match = FIND_IMG.findall(str(tag))
        if not match:
            continue
        # Download the image, then write it; the filename is the last 10
        # characters of the src, as in the original.
        img_response = requests.get(url="http://" + match[0], headers=HEADERS)
        with open("./pic/{}".format(match[0][-10:]), 'wb') as f:
            f.write(img_response.content)
# Get the link to the next page
def Get_Next_Url(URL):
    """Return the absolute URL of the next (older) comment page of *URL*.

    Raises:
        ValueError: when the page has no next-page link or its href cannot
            be parsed (the original raised an opaque IndexError instead).
    """
    response = requests.get(URL, headers=HEADERS)
    bs = BeautifulSoup(response.text, "html.parser")
    # jandan marks the "older page" link with this CSS class.
    link = bs.find('a', class_="previous-comment-page")
    if link is None:
        raise ValueError("no next-page link found on {}".format(URL))
    # Pull the protocol-less href ("//host/path#comments") out of the tag.
    next_url = re.findall(r'href="//(.*?)#comments', str(link))
    if not next_url:
        raise ValueError("could not parse next-page href on {}".format(URL))
    return "http://" + next_url[0]
if __name__ == "__main__":
    # Crawl the starting page first.
    Get_Img(URL)
    # Then follow the "next page" link PAGE more times, crawling each page.
    for _ in range(PAGE):
        URL = Get_Next_Url(URL)
        Get_Img(URL)
更多推荐
爬虫爬取煎蛋网美女图片
发布评论