Table of Contents
- 1. Environment Setup
- 2. Scraping Counselor Information
- 3. Collecting the Q&A Data
1. Environment Setup
- Python environment: Python 3.6
- Chrome: type chrome://version into Chrome's address bar to check your browser version, then follow the link to download the matching chromedriver.exe.
- Third-party libraries: BeautifulSoup4, selenium, pandas (for reading/writing CSV); the standard-library time module is also used. A quick version check is sketched below.
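To confirm the environment is ready, you can print the versions (a minimal sketch; it only assumes the packages listed above are installed):
import sys
import bs4, selenium, pandas

print(sys.version)            # expect 3.6.x
print(bs4.__version__)        # BeautifulSoup4
print(selenium.__version__)
print(pandas.__version__)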
2. Scraping Counselor Information
Site: link
Goal: scrape the IDs and names of all 205 counselors.
Analysis: the names need no explanation, so let's focus on the IDs. Open the developer tools and inspect any counselor to locate their tag in the HTML. We only need to parse out the href attribute and extract the digits from it (see the sketch below).
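As a minimal sketch of that idea (the href value below is a made-up example, not taken from the site):
import re

href = '/experts/12345'                      # hypothetical href from a counselor's <a> tag
doctor_id = re.search(r'\d+', href).group()  # pull out the run of digits, i.e. the ID
print(doctor_id)                             # -> '12345'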
Since collecting the counselors' IDs and names does not require clicking into each counselor's detail page, there is no need for webdriver here; parsing the page with bs4 or similar is faster, and the code is simpler to write! The full code follows:
#!/usr/bin/env python
# encoding=utf-8
"""
Author: YJY
Purpose: scrape the counselor LIST from the 壹点零 site
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

DOWNLOAD_URL = 'https://ydl/experts/'

def download_page(url):
    # Fetch a page with a desktop Chrome User-Agent and return the raw bytes
    return requests.get(url, headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }).content
def parse_html(html, doctor_information_list):
    # Parse one listing page, appending [id, name] pairs to doctor_information_list
    try:
        soup = BeautifulSoup(html, 'html.parser')
        doctor_list_soup = soup.find('div', attrs={'class': 'expertsList_items'})
        # find_all returns every counselor on this page; each counselor's full
        # information sits inside a div with class "item"
        for doctor_item in doctor_list_soup.find_all('div', attrs={'class': 'item'}):
            doctor_info = doctor_item.find('div', attrs={'class': 'info'})
            doctor_information = doctor_info.find('h3').find('a')
            # Get the name
            doctor_name = doctor_information.getText()
            # Get the ID: keep only the digits (and any '#') from the href
            doctor_id = doctor_information['href']
            doctor_id = ''.join(filter(lambda i: i == '#' or i.isdigit(), doctor_id))
            # Append the ID and the cleaned-up name to the counselor list
            doctor_information_list.append([doctor_id, doctor_name.replace('\n', '').replace(' ', '')])
        # Check whether a next page exists
        next_page = soup.find('li', attrs={'class': 'next'}).find('a')
        if next_page:
            print(next_page['href'])
            # A next page exists: return the current list and the next page's URL
            return doctor_information_list, DOWNLOAD_URL + next_page['href']
        # No next page: return the current list with url=None to end the while loop
        return doctor_information_list, None
    # Crude error handling: try to keep going from the next page; if next_page
    # was never resolved, the resulting error falls through to main()'s handler
    except Exception as e:
        return doctor_information_list, DOWNLOAD_URL + next_page['href']
def main():
    url = DOWNLOAD_URL
    page = 0
    doctors = []
    try:
        while url:
            # Report the page currently being scraped (the first two fetches both
            # returned page one; I didn't dig into that bug, but it is harmless)
            print("Scraping page %d" % page)
            page += 1
            try:
                # Download and parse the current page
                html = download_page(url)
                doctors, url = parse_html(html, doctors)
            except Exception as e:
                # Page 102 raises an exception (cause not investigated), so on
                # error we set the next page's URL manually
                url = DOWNLOAD_URL + '/experts?experts=&page=' + str(page)
    # Crude top-level error handling
    except Exception as e:
        print(e)
    # Convert the list into a DataFrame and export it to CSV
    name = ['DocID', 'DocName']
    doctor_information = pd.DataFrame(columns=name, data=doctors)
    doctor_information.to_csv('data/doctor_information.csv', encoding='utf-8', index=None)

if __name__ == '__main__':
    main()
With that we have every counselor's ID and name, ready for the data collection that follows. The code was written in a hurry and leaves plenty of room for improvement; it is rough because the only goal was to get it working, and the exception handling in particular is very casual, so please don't scrutinize it too closely.
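For a quick sanity check of the exported file (a minimal sketch, using the same path as the code above):
import pandas as pd

df = pd.read_csv('data/doctor_information.csv')
print(df.shape)   # expect roughly (205, 2): DocID and DocName
print(df.head())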
3. Collecting the Q&A Data
Now let's start writing the code.
Up front: if you are not yet familiar with XPath or with selenium/webdriver, see these tutorials (a small warm-up sketch follows the list):
- xpath: xpath tutorial
- webdriver: selenium + webdriver tutorial
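As a tiny warm-up before the real code (a sketch: the URL is a placeholder, and the old-style find_element_by_xpath API matches the selenium version used throughout this post):
from selenium import webdriver

driver = webdriver.Chrome(r"chromedriver.exe")
driver.get('https://example.com')
# locate the page's <h1> via an XPath expression and read its text
print(driver.find_element_by_xpath('//h1').text)
driver.quit()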
First, import the relevant packages:
import pandas as pd
import time
from selenium import webdriver
Define a YiDianLing class with five attributes and two methods.
class YiDianLing(object):
    def __init__(self):
        self.url = "https://ydl/experts/"
        # Launch the main browser
        self.driver = webdriver.Chrome(r"chromedriver.exe")
        # Launch a child browser (navigating back with webdriver can lose data,
        # so we open each counselor's individual Q&A pages in the child browser
        # while the main browser stays on the counselor's Q&A list; there are
        # plenty of pitfalls here, so be careful!)
        self.driver_son = webdriver.Chrome(r"chromedriver.exe")
        # Load the counselors' IDs and names
        self.DoctorIdlist = pd.read_csv("data/doctor_information.csv")['DocID']
        self.DoctorNamelist = pd.read_csv("data/doctor_information.csv")['DocName']

    # Scrape the Q&A data (walked through below)
    def AskDataCrawl(self):
        ...

    # Scrape the article data (not walked through; see the full code at the end)
    def ArticleDataCrawl(self):
        ...
Create the Q&A data list (at the end it is converted to a DataFrame and exported as CSV). Then iterate over the counselor information file scraped earlier, visiting each counselor's home page in turn.
# Create the Q&A data list
AskData = []
# Iterate over the counselor list
for i in range(len(self.DoctorIdlist)):
    # Visit the (i+1)-th counselor's home page
    doctorurl = self.url + str(self.DoctorIdlist[i])
    self.driver.get(doctorurl)
    time.sleep(1)
Click "查看全部" (view all) on the page to jump to the Q&A section.
# Jump to the Q&A section ("查看全部")
self.driver.find_element_by_link_text('查看全部').click()
# Close the old window
self.driver.switch_to.window(self.driver.window_handles[0])
self.driver.close()
# Make the last opened window the current one
self.driver.switch_to.window(self.driver.window_handles[-1])
time.sleep(1)
Iterate over all Q&A entries on the current page.
# Grab every Q&A entry on the page
questionsList = self.driver.find_elements_by_class_name('item')
for question in questionsList:
    time.sleep(3)
    try:
        # Get the question URL
        QuestionUrl = question.find_element_by_tag_name('a').get_attribute('href')
        print(QuestionUrl)
        # Build the question ID and the counselor-answer ID
        QuestionID = QuestionUrl.split('/')[4]
        AnswerID = QuestionID + str(self.DoctorIdlist[i])
        # Open the question in the child browser
        self.driver_son.get(QuestionUrl)
        time.sleep(1)
        # Get the question text
        QuestionText = self.driver_son.find_element_by_xpath('//*[@class="content"]/p').text
        QuestionText = QuestionText.replace('\n', '')
        # Get the time the question was asked
        QuestionDate = self.driver_son.find_element_by_class_name('ask_right').text.split('\n')
        QuestionDate = QuestionDate[0]
        # Get the time of the counselor's reply
        AnswerDate = self.driver_son.find_element_by_xpath('//*[@class="text first-lever"]/time').text
        # Get the counselor's reply text
        AnswerText = self.driver_son.find_element_by_xpath('//*[@class="text first-lever"]/p').text
        AnswerText = AnswerText.replace('\n', '')
        # Get the number of thanks the reply received
        AnswerThanks = self.driver_son.find_element_by_xpath('//*[@class="votable"]/a/font').text
        # Append the record to the list
        AskData.append([self.DoctorIdlist[i], self.DoctorNamelist[i], QuestionID, QuestionText, QuestionDate, AnswerID, AnswerDate, AnswerText, AnswerThanks])
    except Exception as e:
        continue
Jump to the next page; when the jump fails, break out of the loop and move on to the next counselor's home page, repeating the loop above.
# Jump to the next page (page is initialized to 1 before the while loop)
try:
    page += 1
    self.driver.get('https://ydl/experts/' + str(self.DoctorIdlist[i]) + '/answerList/p' + str(page))
    time.sleep(3)
except Exception as e:
    break
The full code follows, covering both the Q&A data and the article data. It was written in a hurry, so there are bound to be some small issues; please bear with me!
#!/usr/bin/env python
# encoding=utf-8
"""
Author: YJY
Purpose: scrape counselor Q&A data and article data from the 壹点零 site
"""
import pandas as pd
import time
from selenium import webdriver

class YiDianLing(object):
    def __init__(self):
        self.url = "https://ydl/experts/"
        # Launch the main browser and the child browser
        self.driver = webdriver.Chrome(r"chromedriver.exe")
        self.driver_son = webdriver.Chrome(r"chromedriver.exe")
        # Load the counselors' IDs and names
        self.DoctorIdlist = pd.read_csv("data/doctor_information.csv")['DocID']
        self.DoctorNamelist = pd.read_csv("data/doctor_information.csv")['DocName']
    def AskDataCrawl(self):
        # Create the Q&A data list
        AskData = []
        # Iterate over the counselor list
        nums = 1
        for i in range(len(self.DoctorIdlist)):
            # Visit the (i+1)-th counselor's home page
            doctorurl = self.url + str(self.DoctorIdlist[i])
            self.driver.get(doctorurl)
            time.sleep(1)
            # Jump to the Q&A section ("查看全部")
            self.driver.find_element_by_link_text('查看全部').click()
            self.driver.switch_to.window(self.driver.window_handles[0])
            self.driver.close()
            self.driver.switch_to.window(self.driver.window_handles[-1])
            time.sleep(1)
            # Page number within the question list
            page = 1
            while True:
                # Grab every Q&A entry on the page
                questionsList = self.driver.find_elements_by_class_name('item')
                for question in questionsList:
                    time.sleep(3)
                    try:
                        # Get the question URL
                        QuestionUrl = question.find_element_by_tag_name('a').get_attribute('href')
                        print(QuestionUrl)
                        # Build the question ID and the counselor-answer ID
                        QuestionID = QuestionUrl.split('/')[4]
                        AnswerID = QuestionID + str(self.DoctorIdlist[i])
                        # Open the question in the child browser
                        self.driver_son.get(QuestionUrl)
                        time.sleep(1)
                        # Get the question text
                        QuestionText = self.driver_son.find_element_by_xpath('//*[@class="content"]/p').text
                        QuestionText = QuestionText.replace('\n', '')
                        # Get the time the question was asked
                        QuestionDate = self.driver_son.find_element_by_class_name('ask_right').text.split('\n')
                        QuestionDate = QuestionDate[0]
                        # Get the time of the counselor's reply
                        AnswerDate = self.driver_son.find_element_by_xpath('//*[@class="text first-lever"]/time').text
                        # Get the counselor's reply text
                        AnswerText = self.driver_son.find_element_by_xpath('//*[@class="text first-lever"]/p').text
                        AnswerText = AnswerText.replace('\n', '')
                        # Get the number of thanks the reply received
                        AnswerThanks = self.driver_son.find_element_by_xpath('//*[@class="votable"]/a/font').text
                        # Append the record to the list
                        AskData.append([self.DoctorIdlist[i], self.DoctorNamelist[i], QuestionID, QuestionText, QuestionDate, AnswerID, AnswerDate, AnswerText, AnswerThanks])
                        print(AskData)
                        # self.driver_son.close()
                    except Exception as e:
                        continue
                # Jump to the next page
                try:
                    page += 1
                    self.driver.get('https://ydl/experts/' + str(self.DoctorIdlist[i]) + '/answerList/p' + str(page))
                    time.sleep(3)
                except Exception as e:
                    break
            nums += 1
        # Export to CSV
        Name = ['DocID', 'DocName', 'QuestionID', 'QuestionText', 'QuestionDate', 'AnswerID', 'AnswerDate', 'AnswerText', 'AnswerThanks']
        df = pd.DataFrame(columns=Name, data=AskData)
        df.to_csv('data/doctor_ask.csv')
    def ArticleDataCrawl(self):
        # Create the article data list
        ArticleData = []
        # Iterate over the counselor list
        for i in range(len(self.DoctorIdlist)):
            # Visit the (i+1)-th counselor's home page
            doctorurl = self.url + str(self.DoctorIdlist[i])
            self.driver.get(doctorurl)
            time.sleep(1)
            # Check whether an article tab exists; if so, enter it
            try:
                if '文章' in self.driver.find_element_by_xpath('//*[@class="content-nav"]/li[5]').text:
                    self.driver.find_element_by_xpath('//*[@class="content-nav"]/li[5]').click()
                    self.driver.switch_to.window(self.driver.window_handles[0])
                    self.driver.close()
                    self.driver.switch_to.window(self.driver.window_handles[-1])
                    time.sleep(1)
                elif '文章' in self.driver.find_element_by_xpath('//*[@class="content-nav"]/li[6]').text:
                    self.driver.find_element_by_xpath('//*[@class="content-nav"]/li[6]').click()
                    self.driver.switch_to.window(self.driver.window_handles[0])
                    self.driver.close()
                    self.driver.switch_to.window(self.driver.window_handles[-1])
                    time.sleep(1)
                else:
                    continue
            except Exception as e:
                continue
            # Page number within the article list
            page = 1
            err_appear = False
            while not err_appear:
                articlesList = self.driver.find_elements_by_xpath('//*[@class="ui-content testings index testings_index chrome modern mac webkit desktop"]/div')
                for article in articlesList:
                    time.sleep(3)
                    try:
                        # Get the article URL
                        ArticleUrl = article.find_element_by_tag_name('a').get_attribute('href')
                        print(ArticleUrl)
                    except:
                        err_appear = True
                        break
                    # Get the article ID
                    ArticleID = ArticleUrl.split('/')[4]
                    # Open the article in the child browser
                    self.driver_son.get(ArticleUrl)
                    time.sleep(1)
                    try:
                        # Get the publish date, view count, and vote count
                        ArticleInfo = self.driver_son.find_element_by_xpath('//*[@class="post_desc"]').text
                        ArticleDate = ArticleInfo.split(' ')[1]
                        ArticleViews = ArticleInfo.split(' ')[2]
                        ArticleViews = ''.join(filter(str.isdigit, ArticleViews))
                        ArticleVotes = ArticleInfo.split(' ')[3]
                        ArticleVotes = ''.join(filter(str.isdigit, ArticleVotes))
                        # Get the article text
                        ArticleText = self.driver_son.find_element_by_class_name('event_content').text
                        ArticleText = ArticleText.replace('\n', '')
                        # Get the number of images in the article
                        ArticleImageNum = len(self.driver_son.find_elements_by_xpath('//*[@class="event_content"]//img'))
                        # Append the record to the list
                        ArticleData.append([self.DoctorIdlist[i], self.DoctorNamelist[i], ArticleID, ArticleDate, ArticleImageNum, ArticleText, ArticleViews, ArticleVotes])
                        print(ArticleData)
                    except Exception as e:
                        continue
                # Jump to the next page
                try:
                    page += 1
                    self.driver.get('https://ydl/experts/' + str(self.DoctorIdlist[i]) + '/jingyan/p' + str(page))
                    time.sleep(3)
                except Exception as e:
                    break
        # Export to CSV
        Name = ['DocID', 'DocName', 'ArticleID', 'ArticleDate', 'ArticleImageNum', 'ArticleText', 'ArticleViews', 'ArticleVotes']
        df = pd.DataFrame(columns=Name, data=ArticleData)
        df.to_csv('data/doctor_article.csv')
if __name__ == '__main__':
    yidianling = YiDianLing()
    yidianling.AskDataCrawl()
    yidianling.ArticleDataCrawl()