Create the crawler project
conda activate Scrapy
scrapy startproject wxapp
cd wxapp
# Use the crawl template here so the generated spider is a CrawlSpider
scrapy genspider -t crawl wxapp_spider wxapp-union.com
Define the items to scrape
import scrapy

class WxappItem(scrapy.Item):
    title = scrapy.Field()
    author = scrapy.Field()
    time = scrapy.Field()
    mh_content = scrapy.Field()
Write the spider
Remember: the callback of a Rule must never be named parse! CrawlSpider implements parse itself to drive the rules, so overriding it breaks the crawl.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# Import the item class defined above
from wxapp.items import WxappItem

class WxappSpiderSpider(CrawlSpider):
    name = 'wxapp_spider'
    # Restrict crawling to this domain
    allowed_domains = ['wxapp-union.com']
    # Starting URL
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']
    # URL filtering rules (regular expressions)
    rules = (
        # Match "next page" links on list pages. follow=True means that after
        # visiting a matching page, keep extracting links that match the rules
        # from it; list pages need no callback.
        Rule(LinkExtractor(allow=r'.+mod=list&catid=2&page=\d'), follow=True),
        # Match article detail links; these need the parse callback, and no
        # further links are extracted from detail pages.
        Rule(LinkExtractor(allow=r'.+/article-.+\.html'), callback="parse_detail", follow=False),
    )

    # Parse callback for detail pages
    def parse_detail(self, response):
        # Extract the title, author and publication time
        title = response.xpath("//h1[@class='ph']/text()").extract()[0]
        author_time = response.xpath('//p[@class="authors"]')
        author = author_time.xpath('./a/text()').extract()[0]
        time = author_time.xpath('.//span/text()').extract()[0]
        print(title, author, time)
        # The article body comes back as scattered text nodes and needs cleanup
        contents = response.xpath('//td[@id="article_content"]//text()').extract()
        mh_content = ''
        for content in contents:
            if content is not None:
                mh_content += str(content).strip()
        print(mh_content)
        item = WxappItem(title=title, author=author, time=time, mh_content=mh_content)
        yield item
Store the data
from scrapy.exporters import JsonLinesItemExporter

class WxappPipeline(object):
    def __init__(self):
        # Open the output file in binary mode, as the exporter expects
        self.fp = open('wxapp.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    # Scrapy passes the spider to this hook, so the signature needs a spider argument
    def close_spider(self, spider):
        self.fp.close()
In settings.py, disable robots.txt compliance, enable the pipeline, set a download delay, and add a User-Agent.
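A minimal sketch of the corresponding settings.py entries; the delay value and the User-Agent string are placeholders to adjust as needed:

# settings.py
# Do not obey robots.txt
ROBOTSTXT_OBEY = False
# Wait between requests (seconds) to avoid hammering the site
DOWNLOAD_DELAY = 1
# Send a browser-like User-Agent with every request
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
}
# Enable the pipeline defined above
ITEM_PIPELINES = {
    'wxapp.pipelines.WxappPipeline': 300,
}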
Create a new file so Scrapy can be run from PyCharm.
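One common approach (the file name start.py is arbitrary) is to call the scrapy command line through scrapy.cmdline, so the spider can be launched with PyCharm's Run button:

# start.py, placed in the project root next to scrapy.cfg
from scrapy import cmdline

# Equivalent to running "scrapy crawl wxapp_spider" in a terminal
cmdline.execute("scrapy crawl wxapp_spider".split())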