Table of Contents
- 1. Environment Setup
- 2. Scraping Counselor Information
- 3. Collecting the Q&A Data
1. Environment Setup
- Python environment: Python 3.6
- Chrome: type chrome://version into Chrome's address bar to check your browser version, then follow the link to download the matching chromedriver.exe.
- Third-party libraries: BeautifulSoup4, selenium, pandas (for reading/writing CSV); the standard-library time module is also used. A quick version check is sketched below.
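To confirm the environment is ready, you can print the versions (a minimal sketch; it only assumes the packages listed above are installed):
import sys
import bs4, selenium, pandas

print(sys.version)            # expect 3.6.x
print(bs4.__version__)        # BeautifulSoup4
print(selenium.__version__)
print(pandas.__version__)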
2. Scraping Counselor Information
Site: link
Goal: scrape the IDs and names of all 205 counselors.
Analysis: the names need no explanation, so let's focus on the IDs. Open the developer tools and inspect any counselor to locate their tag in the HTML. We only need to parse out the href attribute and extract the digits from it (see the sketch below).
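As a minimal sketch of that idea (the href value below is a made-up example, not taken from the site):
import re

href = '/experts/12345'                      # hypothetical href from a counselor's <a> tag
doctor_id = re.search(r'\d+', href).group()  # pull out the run of digits, i.e. the ID
print(doctor_id)                             # -> '12345'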
Since collecting the counselors' IDs and names does not require clicking into each counselor's detail page, there is no need for webdriver here; parsing the page with bs4 or similar is faster, and the code is simpler to write! The full code follows:
#!/usr/bin/env python
# encoding=utf-8
"""
Author: YJY
Purpose: scrape the counselor LIST from the 壹点零 site
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

DOWNLOAD_URL = 'https://ydl/experts/'

def download_page(url):
    # Fetch a page with a desktop Chrome User-Agent and return the raw bytes
    return requests.get(url, headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }).content
def parse_html(html, doctor_information_list):
    # Parse one listing page, appending [id, name] pairs to doctor_information_list
    try:
        soup = BeautifulSoup(html, 'html.parser')
        doctor_list_soup = soup.find('div', attrs={'class': 'expertsList_items'})
        # find_all returns every counselor on this page; each counselor's full
        # information sits inside a div with class "item"
        for doctor_item in doctor_list_soup.find_all('div', attrs={'class': 'item'}):
            doctor_info = doctor_item.find('div', attrs={'class': 'info'})
            doctor_information = doctor_info.find('h3').find('a')
            # Get the name
            doctor_name = doctor_information.getText()
            # Get the ID: keep only the digits (and any '#') from the href
            doctor_id = doctor_information['href']
            doctor_id = ''.join(filter(lambda i: i == '#' or i.isdigit(), doctor_id))
            # Append the ID and the cleaned-up name to the counselor list
            doctor_information_list.append([doctor_id, doctor_name.replace('\n', '').replace(' ', '')])
        # Check whether a next page exists
        next_page = soup.find('li', attrs={'class': 'next'}).find('a')
        if next_page:
            print(next_page['href'])
            # A next page exists: return the current list and the next page's URL
            return doctor_information_list, DOWNLOAD_URL + next_page['href']
        # No next page: return the current list with url=None to end the while loop
        return doctor_information_list, None
    # Crude error handling: try to keep going from the next page; if next_page
    # was never resolved, the resulting error falls through to main()'s handler
    except Exception as e:
        return doctor_information_list, DOWNLOAD_URL + next_page['href']
def main():
    url = DOWNLOAD_URL
    page = 0
    doctors = []
    try:
        while url:
            # Report the page currently being scraped (the first two fetches both
            # returned page one; I didn't dig into that bug, but it is harmless)
            print("Scraping page %d" % page)
            page += 1
            try:
                # Download and parse the current page
                html = download_page(url)
                doctors, url = parse_html(html, doctors)
            except Exception as e:
                # Page 102 raises an exception (cause not investigated), so on
                # error we set the next page's URL manually
                url = DOWNLOAD_URL + '/experts?experts=&page=' + str(page)
    # Crude top-level error handling
    except Exception as e:
        print(e)
    # Convert the list into a DataFrame and export it to CSV
    name = ['DocID', 'DocName']
    doctor_information = pd.DataFrame(columns=name, data=doctors)
    doctor_information.to_csv('data/doctor_information.csv', encoding='utf-8', index=None)

if __name__ == '__main__':
    main()
With that we have every counselor's ID and name, ready for the data collection that follows. The code was written in a hurry and leaves plenty of room for improvement; it is rough because the only goal was to get it working, and the exception handling in particular is very casual, so please don't scrutinize it too closely.
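For a quick sanity check of the exported file (a minimal sketch, using the same path as the code above):
import pandas as pd

df = pd.read_csv('data/doctor_information.csv')
print(df.shape)   # expect roughly (205, 2): DocID and DocName
print(df.head())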
3. Collecting the Q&A Data
Now let's start writing the code.
Up front: if you are not yet familiar with XPath or with selenium/webdriver, see these tutorials (a small warm-up sketch follows the list):
- xpath: xpath tutorial
- webdriver: selenium + webdriver tutorial
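As a tiny warm-up before the real code (a sketch: the URL is a placeholder, and the old-style find_element_by_xpath API matches the selenium version used throughout this post):
from selenium import webdriver

driver = webdriver.Chrome(r"chromedriver.exe")
driver.get('https://example.com')
# locate the page's <h1> via an XPath expression and read its text
print(driver.find_element_by_xpath('//h1').text)
driver.quit()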
First, import the relevant packages:
import pandas as pd
import time
from selenium import webdriver
Define a YiDianLing class with five attributes and two methods.
class YiDianLing(object):
    def __init__(self):
        self.url = "https://ydl/experts/"
        # Launch the main browser
        self.driver = webdriver.Chrome(r"chromedriver.exe")
        # Launch a child browser (navigating back with webdriver can lose data,
        # so we open each counselor's individual Q&A pages in the child browser
        # while the main browser stays on the counselor's Q&A list; there are
        # plenty of pitfalls here, so be careful!)
        self.driver_son = webdriver.Chrome(r"chromedriver.exe")
        # Load the counselors' IDs and names
        self.DoctorIdlist = pd.read_csv("data/doctor_information.csv")['DocID']
        self.DoctorNamelist = pd.read_csv("data/doctor_information.csv")['DocName']

    # Scrape the Q&A data (walked through below)
    def AskDataCrawl(self):
        ...

    # Scrape the article data (not walked through; see the full code at the end)
    def ArticleDataCrawl(self):
        ...
Create the Q&A data list (at the end it is converted to a DataFrame and exported as CSV). Then iterate over the counselor information file scraped earlier, visiting each counselor's home page in turn.
# Create the Q&A data list
AskData = []
# Iterate over the counselor list
for i in range(len(self.DoctorIdlist)):
    # Visit the (i+1)-th counselor's home page
    doctorurl = self.url + str(self.DoctorIdlist[i])
    self.driver.get(doctorurl)
    time.sleep(1)
Click "查看全部" (view all) on the page to jump to the Q&A section.
# Jump to the Q&A section ("查看全部")
self.driver.find_element_by_link_text('查看全部').click()
# Close the old window
self.driver.switch_to.window(self.driver.window_handles[0])
self.driver.close()
# Make the last opened window the current one
self.driver.switch_to.window(self.driver.window_handles[-1])
time.sleep(1)
Iterate over all Q&A entries on the current page.
# Grab every Q&A entry on the page
questionsList = self.driver.find_elements_by_class_name('item')
for question in questionsList:
    time.sleep(3)
    try:
        # Get the question URL
        QuestionUrl = question.find_element_by_tag_name('a').get_attribute('href')
        print(QuestionUrl)
        # Build the question ID and the counselor-answer ID
        QuestionID = QuestionUrl.split('/')[4]
        AnswerID = QuestionID + str(self.DoctorIdlist[i])
        # Open the question in the child browser
        self.driver_son.get(QuestionUrl)
        time.sleep(1)
        # Get the question text
        QuestionText = self.driver_son.find_element_by_xpath('//*[@class="content"]/p').text
        QuestionText = QuestionText.replace('\n', '')
        # Get the time the question was asked
        QuestionDate = self.driver_son.find_element_by_class_name('ask_right').text.split('\n')
        QuestionDate = QuestionDate[0]
        # Get the time of the counselor's reply
        AnswerDate = self.driver_son.find_element_by_xpath('//*[@class="text first-lever"]/time').text
        # Get the counselor's reply text
        AnswerText = self.driver_son.find_element_by_xpath('//*[@class="text first-lever"]/p').text
        AnswerText = AnswerText.replace('\n', '')
        # Get the number of thanks the reply received
        AnswerThanks = self.driver_son.find_element_by_xpath('//*[@class="votable"]/a/font').text
        # Append the record to the list
        AskData.append([self.DoctorIdlist[i], self.DoctorNamelist[i], QuestionID, QuestionText, QuestionDate, AnswerID, AnswerDate, AnswerText, AnswerThanks])
    except Exception as e:
        continue
Jump to the next page; when the jump fails, break out of the loop and move on to the next counselor's home page, repeating the loop above.
# Jump to the next page (page is initialized to 1 before the while loop)
try:
    page += 1
    self.driver.get('https://ydl/experts/' + str(self.DoctorIdlist[i]) + '/answerList/p' + str(page))
    time.sleep(3)
except Exception as e:
    break
The full code follows, covering both the Q&A data and the article data. It was written in a hurry, so there are bound to be some small issues; please bear with me!
#!/usr/bin/env python
# encoding=utf-8
"""
Author: YJY
Purpose: scrape counselor Q&A data and article data from the 壹点零 site
"""
import pandas as pd
import time
from selenium import webdriver

class YiDianLing(object):
    def __init__(self):
        self.url = "https://ydl/experts/"
        # Launch the main browser and the child browser
        self.driver = webdriver.Chrome(r"chromedriver.exe")
        self.driver_son = webdriver.Chrome(r"chromedriver.exe")
        # Load the counselors' IDs and names
        self.DoctorIdlist = pd.read_csv("data/doctor_information.csv")['DocID']
        self.DoctorNamelist = pd.read_csv("data/doctor_information.csv")['DocName']
    def AskDataCrawl(self):
        # Create the Q&A data list
        AskData = []
        # Iterate over the counselor list
        nums = 1
        for i in range(len(self.DoctorIdlist)):
            # Visit the (i+1)-th counselor's home page
            doctorurl = self.url + str(self.DoctorIdlist[i])
            self.driver.get(doctorurl)
            time.sleep(1)
            # Jump to the Q&A section ("查看全部")
            self.driver.find_element_by_link_text('查看全部').click()
            self.driver.switch_to.window(self.driver.window_handles[0])
            self.driver.close()
            self.driver.switch_to.window(self.driver.window_handles[-1])
            time.sleep(1)
            # Page number within the question list
            page = 1
            while True:
                # Grab every Q&A entry on the page
                questionsList = self.driver.find_elements_by_class_name('item')
                for question in questionsList:
                    time.sleep(3)
                    try:
                        # Get the question URL
                        QuestionUrl = question.find_element_by_tag_name('a').get_attribute('href')
                        print(QuestionUrl)
                        # Build the question ID and the counselor-answer ID
                        QuestionID = QuestionUrl.split('/')[4]
                        AnswerID = QuestionID + str(self.DoctorIdlist[i])
                        # Open the question in the child browser
                        self.driver_son.get(QuestionUrl)
                        time.sleep(1)
                        # Get the question text
                        QuestionText = self.driver_son.find_element_by_xpath('//*[@class="content"]/p').text
                        QuestionText = QuestionText.replace('\n', '')
                        # Get the time the question was asked
                        QuestionDate = self.driver_son.find_element_by_class_name('ask_right').text.split('\n')
                        QuestionDate = QuestionDate[0]
                        # Get the time of the counselor's reply
                        AnswerDate = self.driver_son.find_element_by_xpath('//*[@class="text first-lever"]/time').text
                        # Get the counselor's reply text
                        AnswerText = self.driver_son.find_element_by_xpath('//*[@class="text first-lever"]/p').text
                        AnswerText = AnswerText.replace('\n', '')
                        # Get the number of thanks the reply received
                        AnswerThanks = self.driver_son.find_element_by_xpath('//*[@class="votable"]/a/font').text
                        # Append the record to the list
                        AskData.append([self.DoctorIdlist[i], self.DoctorNamelist[i], QuestionID, QuestionText, QuestionDate, AnswerID, AnswerDate, AnswerText, AnswerThanks])
                        print(AskData)
                        # self.driver_son.close()
                    except Exception as e:
                        continue
                # Jump to the next page
                try:
                    page += 1
                    self.driver.get('https://ydl/experts/' + str(self.DoctorIdlist[i]) + '/answerList/p' + str(page))
                    time.sleep(3)
                except Exception as e:
                    break
            nums += 1
        # Export to CSV
        Name = ['DocID', 'DocName', 'QuestionID', 'QuestionText', 'QuestionDate', 'AnswerID', 'AnswerDate', 'AnswerText', 'AnswerThanks']
        df = pd.DataFrame(columns=Name, data=AskData)
        df.to_csv('data/doctor_ask.csv')
    def ArticleDataCrawl(self):
        # Create the article data list
        ArticleData = []
        # Iterate over the counselor list
        for i in range(len(self.DoctorIdlist)):
            # Visit the (i+1)-th counselor's home page
            doctorurl = self.url + str(self.DoctorIdlist[i])
            self.driver.get(doctorurl)
            time.sleep(1)
            # Check whether an article tab exists; if so, enter it
            try:
                if '文章' in self.driver.find_element_by_xpath('//*[@class="content-nav"]/li[5]').text:
                    self.driver.find_element_by_xpath('//*[@class="content-nav"]/li[5]').click()
                    self.driver.switch_to.window(self.driver.window_handles[0])
                    self.driver.close()
                    self.driver.switch_to.window(self.driver.window_handles[-1])
                    time.sleep(1)
                elif '文章' in self.driver.find_element_by_xpath('//*[@class="content-nav"]/li[6]').text:
                    self.driver.find_element_by_xpath('//*[@class="content-nav"]/li[6]').click()
                    self.driver.switch_to.window(self.driver.window_handles[0])
                    self.driver.close()
                    self.driver.switch_to.window(self.driver.window_handles[-1])
                    time.sleep(1)
                else:
                    continue
            except Exception as e:
                continue
            # Page number within the article list
            page = 1
            err_appear = False
            while not err_appear:
                articlesList = self.driver.find_elements_by_xpath('//*[@class="ui-content testings index testings_index chrome modern mac webkit desktop"]/div')
                for article in articlesList:
                    time.sleep(3)
                    try:
                        # Get the article URL
                        ArticleUrl = article.find_element_by_tag_name('a').get_attribute('href')
                        print(ArticleUrl)
                    except:
                        err_appear = True
                        break
                    # Get the article ID
                    ArticleID = ArticleUrl.split('/')[4]
                    # Open the article in the child browser
                    self.driver_son.get(ArticleUrl)
                    time.sleep(1)
                    try:
                        # Get the publish date, view count, and vote count
                        ArticleInfo = self.driver_son.find_element_by_xpath('//*[@class="post_desc"]').text
                        ArticleDate = ArticleInfo.split(' ')[1]
                        ArticleViews = ArticleInfo.split(' ')[2]
                        ArticleViews = ''.join(filter(str.isdigit, ArticleViews))
                        ArticleVotes = ArticleInfo.split(' ')[3]
                        ArticleVotes = ''.join(filter(str.isdigit, ArticleVotes))
                        # Get the article text
                        ArticleText = self.driver_son.find_element_by_class_name('event_content').text
                        ArticleText = ArticleText.replace('\n', '')
                        # Get the number of images in the article
                        ArticleImageNum = len(self.driver_son.find_elements_by_xpath('//*[@class="event_content"]//img'))
                        # Append the record to the list
                        ArticleData.append([self.DoctorIdlist[i], self.DoctorNamelist[i], ArticleID, ArticleDate, ArticleImageNum, ArticleText, ArticleViews, ArticleVotes])
                        print(ArticleData)
                    except Exception as e:
                        continue
                # Jump to the next page
                try:
                    page += 1
                    self.driver.get('https://ydl/experts/' + str(self.DoctorIdlist[i]) + '/jingyan/p' + str(page))
                    time.sleep(3)
                except Exception as e:
                    break
        # Export to CSV
        Name = ['DocID', 'DocName', 'ArticleID', 'ArticleDate', 'ArticleImageNum', 'ArticleText', 'ArticleViews', 'ArticleVotes']
        df = pd.DataFrame(columns=Name, data=ArticleData)
        df.to_csv('data/doctor_article.csv')
if __name__ == '__main__':
    yidianling = YiDianLing()
    yidianling.AskDataCrawl()
    yidianling.ArticleDataCrawl()