The imported package is a utility module I wrote myself; the only third-party libraries the code needs are requests and bs4 (install them with pip install requests beautifulsoup4):

# encoding:utf-8
from web_tools.SpiderTools.webspider_setting import *
import requests
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Cookie': 'SSCSum=2; U_TRS1=000000aa.bbe6cf5c.5ffdeb63.dfe22b1b; U_TRS2=000000aa.bbf0cf5c.5ffdeb63.31ef8546; SSCSum=1; UOR=,open.sina,; SINAGLOBAL=223.149.68.170_1610476387.876180; Apache=223.149.68.170_1610476387.876182; lxlrttp=1578733570; ULV=1610476403038:2:2:2:223.149.68.170_1610476387.876182:1610476389766; vjuids=-242e60c0c.176f7dfb9e2.0.7bd0489f7a122; vjlast=1610476403',
    'Host': 'open.sina',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
}


def index(urls):
    # make the request (html_get is my own helper; it returns a parsed page)
    html = get_tools.html_get(urls, headers=headers)
    videos = html.find_all('div', class_="vido")
    # extract the fields from each course entry
    for v in videos:
        title_tag = v.find('h2', class_="tit fbluel")
        title = title_tag.text
        video_address = title_tag.find('a')['href']
        video_img = v.find('div', class_="pic").find('img')['src']
        teachers = v.find('a', class_="fblue").text
        introductions = v.find('p', class_="intro").text
        prices = ""      # not available on the list page
        video_time = ""  # not available on the list page

        # save the record to MySQL (create_mysql is my own helper)
        save_tools.create_mysql('open.sina', title, video_img, video_time, prices,
                                introductions, teachers, video_address)
    # find the "next page" link on the current page
    next_page = html.find_all('p', class_="page")[0].find_all('a')
    for np in next_page:
        if "下一页" in str(np):  # "下一页" is the link text for "next page"
            next_url = np['href']
            # recursive self-call: crawl the next page
            index(next_url)

if __name__ == '__main__':
    for i in range(2, 15):  # category ids used in the listing URL
        url = 'http://open.sina/discipline/id_{}/page_1/mn_0/'.format(i)
        index(url)
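
The two helpers above live in my own web_tools module, so they will not resolve on another machine. Here is a minimal sketch of what they might look like, assuming html_get returns a parsed BeautifulSoup object and create_mysql inserts one row via pymysql; the connection settings, table name, and column names are placeholders, not the real webspider_setting code:

# a minimal sketch of the two helpers, assuming requests + bs4 + pymysql;
# the table/column names below are placeholders, not the real webspider_setting code
import requests
import pymysql
from bs4 import BeautifulSoup

class get_tools:
    @staticmethod
    def html_get(url, headers=None):
        resp = requests.get(url, headers=headers, timeout=10)
        resp.encoding = resp.apparent_encoding  # sina pages mix gbk and utf-8
        return BeautifulSoup(resp.text, 'html.parser')

class save_tools:
    @staticmethod
    def create_mysql(site, title, img, video_time, price, intro, teacher, address):
        conn = pymysql.connect(host='localhost', user='root', password='123456',
                               database='spider', charset='utf8mb4')
        try:
            with conn.cursor() as cur:
                cur.execute(
                    "INSERT INTO videos (site, title, img, video_time, price, intro, teacher, address) "
                    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
                    (site, title, img, video_time, price, intro, teacher, address))
            conn.commit()
        finally:
            conn.close()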

To make the self-recursive next-page call work, the URL obtained on the first request must be kept in a global variable, and that variable name must never be the same as the function name (otherwise the name shadows the function and the recursive call breaks).
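
One more caveat: each page hands off to the next via a recursive call, so a category with very deep pagination can hit Python's default recursion limit (roughly 1000 frames). If that becomes a problem, the same self-following logic can be written as a loop. A sketch, reusing the html_get helper assumed above (field extraction omitted):

# iterative version of the next-page loop: same logic, no recursion depth limit
def index_iter(start_url):
    url = start_url
    while url:
        html = get_tools.html_get(url, headers=headers)
        # ... extract and save the fields exactly as in index() ...
        url = None  # stop unless a next-page link is found
        page_bar = html.find('p', class_="page")
        if page_bar:
            for np in page_bar.find_all('a'):
                if "下一页" in str(np):  # "next page" link text
                    url = np['href']
                    break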
