from selenium import webdriver
from lxml import etree
from time import sleep
import os
import re
import random
import time
import requests

from selenium.webdriver.common.by import By

from get_user_agent import get_user_agent_of_pc
from selenium.webdriver.chrome.service import Service
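
# NOTE: get_user_agent is a local helper module from the original post, not a
# PyPI package. If you do not have it, a minimal stand-in (an assumption, not
# the author's implementation) is any function returning a desktop UA string:
#
# def get_user_agent_of_pc():
#     return ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
#             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')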

def roll_down(chrome):
    # Scroll the page in stages (10% -> 40% -> 70% -> 100% of the page
    # height) so that JD's lazily loaded list items and images render.
    for x in range(1, 11, 3):
        time.sleep(random.random())
        j = x / 10
        js = "document.documentElement.scrollTop=document.documentElement.scrollHeight*%s" % j
        chrome.execute_script(js)




def parse_html(html):
    # Pull the product fields out of JD's search-result list (#J_goodsList).
    tree = etree.HTML(html)
    names = tree.xpath('//div[@id="J_goodsList"]/ul/li/div/div[3]/a/em')
    prices = tree.xpath('//div[@id="J_goodsList"]/ul/li/div/div[2]/strong/i/text()')
    shops = tree.xpath('//div[@id="J_goodsList"]/ul/li/div/div[5]/span/a/text()')
    hrefs = tree.xpath('//div[@id="J_goodsList"]/ul/li/div/div[1]/a/@href')
    img_urls = tree.xpath('//div[@id="J_goodsList"]/ul/li/div/div[1]/a/img/@src')
    comments = tree.xpath('//div[@id="J_goodsList"]/ul/li/div/div[4]/strong/a/text()')

    for name, price, shop, href, img_url, comment in zip(names, prices, shops, hrefs, img_urls, comments):
        # Collapse whitespace in the product name, then strip the characters
        # that are illegal in Windows file names, so the cleaned name can be
        # reused as the image file name below.
        s = re.sub(r'\s+', '', name.xpath('string(.)'))
        s1 = re.sub(r'/+', '-', s)    # replace /
        s2 = re.sub(r'\|+', '_', s1)
        s3 = re.sub(r'\?+', '', s2)
        s4 = re.sub(r'\*+', '', s3)
        s5 = re.sub(r'\\+', '', s4)
        with open('新.txt', 'a+', encoding='utf-8') as f:
            f.write('商品名:' + s + '\t' +
                    '价格:' + price + '\t' +
                    '商品链接:' + 'https:' + href + '\t' +
                    '卖家:' + shop + '\t' +
                    '图片地址:' + 'https:' + img_url + '\t' +
                    '评论数:' + comment + '\n')

        # Download the product image into ./image/, named after the
        # sanitised product name. The path must be a quoted string.
        os.makedirs('image', exist_ok=True)
        response = requests.get('https:' + img_url, headers=headers)
        with open('image/{}.jpg'.format(s5), 'wb') as f:
            f.write(response.content)



def JD_Spider(url):
    chrome_driver = r'F:\daolun\chromedriver.exe'  # raw string, so \ is not treated as an escape
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('user-agent=' + get_user_agent_of_pc())
    options.add_argument('disable-infobars')
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    chrome = webdriver.Chrome(service=Service(chrome_driver), options=options)

    chrome.get(url)
    roll_down(chrome)
    print('正在爬取第{}页...'.format(1))
    html=chrome.page_source
    parse_html(html)

    num = 1
    # "pn-next disabled" is the class of the next-page button once it is
    # greyed out: length 0 means a next page exists, 1 means this is the
    # last page.
    js = 'return document.getElementsByClassName("pn-next disabled").length'
    has_next = chrome.execute_script(js)

    while has_next == 0:
        try:
            # By.XPATH must be upper case; By.xpath does not exist.
            next_page_button = chrome.find_element(by=By.XPATH, value='//a[@class="pn-next"]')
            next_page_button.click()
        except Exception:
            break

        num += 1
        print('正在爬取第{}页...'.format(num))

        roll_down(chrome)
        sleep(3 + random.random())


        next_html = chrome.page_source
        parse_html(next_html)
        js = 'return document.getElementsByClassName("pn-next disabled").length'
        has_next = chrome.execute_script(js)


if __name__ == '__main__':
    # headers is read inside parse_html() as a module-level global
    headers = {
        "User-Agent": get_user_agent_of_pc()
    }

    first_page = 'https://search.jd.com/Search?keyword=macbook&enc=utf-8&wq=macbook&pvid=0798b177abbc445e9b25431224c3c63b'
    JD_Spider(first_page)
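
Note: the listing above uses the Selenium 4 style of driver setup (a Service object instead of the removed executable_path argument) and additionally needs lxml, requests, and a chromedriver build matching the installed Chrome.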


Why does this spider only crawl one page, and why does it report that image is not defined?
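
Both symptoms come from one-line bugs in the original code, corrected in the listing above:

1. Only one page: the next-page lookup used By.xpath, but the By class only has the upper-case attribute By.XPATH. The very first click therefore raised AttributeError, the broad except swallowed it, and break ended the loop after page 1.

2. image is not defined: open(image/{}.jpg, 'wb') passes an unquoted path, so Python evaluates image as a variable name and raises NameError before anything is downloaded. The path has to be a string built at runtime, e.g. from the sanitised product name:

    with open('image/{}.jpg'.format(s5), 'wb') as f:
        f.write(response.content)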