运行环境:
python 3.6
scrapy 1.5.1
pymysql 0.9.2
IDE:pycharm
一、新建项目
命令行运行:
scrapy startproject douban
如下图所示:
产生的目录和文件如下图所示
二、创建爬虫模板
进到项目目录下,运行:
scrapy genspider douban_spider movie.douban.com
在应用目录spiders下产生一个名为douban_spider文件
三、制作爬虫
对items.py进行设置
# serial_number, movie_name, introduce, star, evaluate, describe must match the
# column names of the database table (see the insert SQL in pipelines.py).
# NOTE(review): these Field() assignments belong inside the DoubanItem class body
# in items.py; the enclosing class header is not shown in this excerpt.
# rank number on the Top-250 chart
serial_number = scrapy.Field()
# movie title
movie_name = scrapy.Field()
# one-line movie info (director / cast / year line)
introduce = scrapy.Field()
# movie rating score
star = scrapy.Field()
# number of reviews
evaluate = scrapy.Field()
# short quote describing the movie
describe = scrapy.Field()
命令行方式启动爬虫
scrapy crawl douban_spider
设置默认user-agent头
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
为方便测试,把启动爬虫的命令写到python文件main.py里
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Convenience launcher so the spider can be started from the IDE
# instead of typing `scrapy crawl douban_spider` in a shell.
from scrapy import cmdline

cmdline.execute(['scrapy', 'crawl', 'douban_spider'])
对爬取到的内容进行分析处理
# -*- coding: utf-8 -*-
import scrapy
from douban.items import DoubanItem
class DoubanSpiderSpider(scrapy.Spider):
    """Crawl the Douban Top-250 movie chart and yield one DoubanItem per film."""
    # Spider name used by `scrapy crawl douban_spider`
    name = 'douban_spider'
    # BUG FIX: the domain was truncated to 'movie.douban'; with the wrong domain
    # Scrapy's offsite filter would drop every follow-up request.
    allowed_domains = ['movie.douban.com']
    # Entry point: first page of the Top-250 chart
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        """Default callback: extract every movie on the page, then follow the next-page link."""
        # Each <li> under the grid_view <ol> is one movie entry
        movie_list = response.xpath("//div[@class='article']//ol[@class='grid_view']/li")
        for i_item in movie_list:
            douban_item = DoubanItem()
            # Rank number shown in the <em> tag
            douban_item['serial_number'] = i_item.xpath(
                ".//div[@class='item']//em/text()").extract_first()
            # First <span> inside the hd block holds the main (Chinese) title
            douban_item['movie_name'] = i_item.xpath(
                ".//div[@class='info']/div[@class='hd']/a/span[1]/text()").extract_first()
            content = i_item.xpath(
                ".//div[@class='info']//div[@class='bd']/p[1]/text()").extract()
            # Collapse all whitespace in each line of the multi-line blurb.
            # NOTE: like the original code, each iteration overwrites 'introduce',
            # so only the last line is kept.
            for i_content in content:
                douban_item['introduce'] = "".join(i_content.split())
            douban_item['star'] = i_item.xpath(
                ".//span[@class='rating_num']/text()").extract_first()
            # BUG FIX: ".//div['star']" uses a truthy predicate that matches every
            # <div>; select the star div by its class attribute instead.
            douban_item['evaluate'] = i_item.xpath(
                ".//div[@class='star']/span[4]/text()").extract_first()
            douban_item['describe'] = i_item.xpath(
                ".//p[@class='quote']/span/text()").extract_first()
            # Hand the populated item to the pipelines for storage
            yield douban_item
        # Pagination: the <link> inside span.next carries the relative href
        # (e.g. '?start=25&filter=') of the next page.
        next_link = response.xpath("//span[@class='next']/link/@href").extract()
        if next_link:
            yield scrapy.Request(self.start_urls[0] + next_link[0], callback=self.parse)
四、存储数据
存储为json
存储为csv
存储为mysql
mysql参数配置,在settings.py文件设置
# MySQL connection parameters, read by DoubanPipeline in pipelines.py.
MYSQL_DATABASE = {
'hostname': 'localhost', # server address
'hostpost': 3306, # port — NOTE: key is misspelled ('hostpost'); the pipeline reads this exact key, so renaming it requires changing both files together
'username': 'root', # user name
'password': 'admin123', # password
'database': 'douban', # database name
'charset': 'utf8', # connection character set
}
数据入库操作
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql.cursors
from douban.settings import MYSQL_DATABASE
class DoubanPipeline(object):
    """Persist scraped movie items into the MySQL table `douban_movie`."""

    def __init__(self):
        # Open the MySQL connection once when the pipeline is instantiated.
        # NOTE: 'hostpost' is the (misspelled) key actually present in settings.py.
        self.connect = pymysql.Connect(
            host=MYSQL_DATABASE['hostname'],
            port=MYSQL_DATABASE['hostpost'],
            user=MYSQL_DATABASE['username'],
            password=MYSQL_DATABASE['password'],
            db=MYSQL_DATABASE['database'],
            charset=MYSQL_DATABASE['charset'],
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        """Insert one item as a row; returns the item so later pipelines still see it."""
        data = dict(item)
        serial_number = '%d' % int(data['serial_number'])
        movie_name = data['movie_name']
        introduce = data['introduce']
        star = '%.1f' % float(data['star'])
        evaluate = data['evaluate']
        describe = data['describe']
        # `describe` is a MySQL reserved word, hence the backticks.
        insert_sql = ("insert into `douban_movie`"
                      "(`serial_number`,`movie_name`,`introduce`,`star`,`evaluate`,`describe`)"
                      " values(%s,%s,%s,%s,%s,%s)")
        try:
            self.cursor.execute(
                insert_sql,
                (serial_number, movie_name, introduce, star, evaluate, describe))
            # BUG FIX: without commit() the transaction was never committed and
            # no rows ever appeared in the database.
            self.connect.commit()
        except pymysql.MySQLError:
            self.connect.rollback()
            raise
        return item

    def close_spider(self, spider):
        # Called by Scrapy when the spider finishes; release DB resources.
        self.cursor.close()
        self.connect.close()
开始item_pipelines,在settings.py文件里找到如下选项,如不设置此项,则数据不会添加到数据库里
# Register the pipeline; the value (0-1000) is its order among enabled pipelines.
ITEM_PIPELINES = {
'douban.pipelines.DoubanPipeline': 300,
}
五、伪装处理
伪装的目的是为防止目标阻止爬取,伪装就是为了提高爬取成功率
两种伪装方式:代理和随机USER-AGENT
1.代理
具体代码如下:
# Proxy handling
class MyProxy(object):
    """Downloader middleware that routes every request through the Mogu proxy."""

    def process_request(self, request, spider):
        # Mogu proxy free trial: www.moguproxy.com
        # BUG FIX: the proxy value must be a full URL with a scheme; the original
        # 'transfer.mogumiao:9001' had no 'http://' and a truncated host name,
        # so Scrapy's HttpProxyMiddleware could not use it.
        request.meta['proxy'] = 'http://transfer.mogumiao.com:9001'
        # Two ways to authenticate against the proxy:
        # Option 1: the ready-made base64 appkey issued by the provider.
        appkey = 'd0ZBT2d5RlRZcG94Q2haMDpqajdZMXJqdEhCbnU0ZVFF'
        request.headers['Authorization'] = "Basic " + appkey
        # Option 2: username:password, base64-encoded by hand (requires `import base64`):
        # proxy_name_pass = b'wFAOgyFTYpoxChZ0:jj7Y1rjtHBnu4eQE'
        # encode_pass_name = base64.b64encode(proxy_name_pass)
        # request.headers['Authorization'] = "Basic " + encode_pass_name.decode()
2.随机USER_AGENT
具体代码如下:
# Random User-Agent middleware
class MyUserAgent(object):
    """Downloader middleware that spoofs a random User-Agent on every request."""

    # Pool of User-Agent strings to rotate through (class-level: built once).
    USER_AGENT_LIST = [
        'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23',
        'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)',
        'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)',
        'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)',
        'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)',
        'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0',
        'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
        'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)'
    ]

    def process_request(self, request, spider):
        # BUG FIX: the original code never imported `random` (it only had a
        # comment saying to), which raised NameError at runtime.
        import random
        # BUG FIX: the header name is 'User-Agent'; setting 'User_Agent' creates
        # a different, ignored header, so the spoofing silently did nothing.
        request.headers['User-Agent'] = random.choice(self.USER_AGENT_LIST)
设置配置,开启代理和user-agent,在settings.py里设置
# Enable both custom middlewares; a lower number runs earlier in the request chain.
DOWNLOADER_MIDDLEWARES = {
# 'douban.middlewares.DoubanDownloaderMiddleware': 543,
'douban.middlewares.MyProxy': 543,
'douban.middlewares.MyUserAgent': 544,
}
转载于:https://my.oschina.net/u/3969821/blog/2052148
更多推荐
利用scrapy爬取豆瓣250
发布评论