Create the crawler project

conda activate Scrapy
scrapy startproject wxapp
cd wxapp
# Use the crawl template here so Scrapy generates a CrawlSpider
scrapy genspider -t crawl wxapp_spider wxapp-union.com

Define the items to scrape

import scrapy

class WxappItem(scrapy.Item):

    title = scrapy.Field()
    author = scrapy.Field()
    time = scrapy.Field()
    mh_content = scrapy.Field()

Write the spider

Important: the callback in a Rule must not be named parse! CrawlSpider uses the parse method internally to implement its rule logic, so overriding it breaks the spider.

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# Import the item definition
from wxapp.items import WxappItem


class WxappSpiderSpider(CrawlSpider):
    name = 'wxapp_spider'
    # Restrict crawling to this domain
    allowed_domains = ['wxapp-union.com']
    # Starting URL
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']
    # URL filtering rules (regular expressions)
    rules = (
        # Match "next page" links on the list pages; follow=True means Scrapy keeps
        # extracting matching links from every page it visits. No callback is needed here.
        Rule(LinkExtractor(allow=r'.+mod=list&catid=2&page=\d'), follow=True),
        # Match article detail links; these need a callback to parse the page,
        # and there is no need to follow further links from a detail page.
        Rule(LinkExtractor(allow=r'.+/article-.+\.html'), callback="parse_detail", follow=False),
    )

    # Parse an article detail page
    def parse_detail(self, response):
        # Extract the title, author, and publish time
        title = response.xpath("//h1[@class='ph']/text()").extract()[0]
        author_time = response.xpath('//p[@class="authors"]')
        author = author_time.xpath('./a/text()').extract()[0]
        time = author_time.xpath('.//span/text()').extract()[0]
        print(title, author, time)
        # The article body needs cleanup: join all text nodes and strip whitespace
        contents = response.xpath('//td[@id="article_content"]//text()').extract()
        mh_content = ''
        for content in contents:
            if content is not None:
                mh_content += str(content).strip()
        print(mh_content)

        item = WxappItem(title=title, author=author, time=time, mh_content=mh_content)
        yield item

Store the data

from scrapy.exporters import JsonLinesItemExporter


class WxappPipeline(object):
    def __init__(self):
        # Write items as JSON lines; open in binary mode because the exporter writes bytes
        self.fp = open('wxapp.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # Close the output file when the spider finishes
        self.fp.close()

In settings.py, disable robots.txt compliance, enable the pipeline, set a download delay, and add a User-Agent header.
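A minimal sketch of the relevant settings; the setting names are standard Scrapy options, while the delay value and the User-Agent string are only placeholders to adapt as needed:

# settings.py (only the relevant parts; values are illustrative)

# Ignore robots.txt so the listing pages can be fetched
ROBOTSTXT_OBEY = False

# Wait between requests to avoid hammering the site
DOWNLOAD_DELAY = 1

# Send a browser-like User-Agent (placeholder string, replace with your own)
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0 Safari/537.36',
}

# Enable the pipeline defined in pipelines.py
ITEM_PIPELINES = {
    'wxapp.pipelines.WxappPipeline': 300,
}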

Create a new file so the spider can be run directly from PyCharm.
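A minimal sketch, assuming the file is named start.py and sits in the project root next to scrapy.cfg; it simply invokes the scrapy command line programmatically:

# start.py
from scrapy import cmdline

# Equivalent to running "scrapy crawl wxapp_spider" in a terminal
cmdline.execute("scrapy crawl wxapp_spider".split())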
