煎蛋网妹子图爬虫

自从开了自己的站点，就好久没来写过CSDN的博客了。维护自己那个站点的功夫确实也花得比较多，所以这里也就没怎么更新了。

由于自己也是大四了，需要考雅思出国，能留给自己兴趣爱好的时间就更加的少了。现在雅思也考完了，学校的水课还没有开始考，处于这种大四退休生活中的我终于又有时间来写博客了。

#coding:utf8
import hashlib
import md5
import multiprocessing
import os
import random
import sys
import time
import urllib
import urllib2

import requests
from bs4 import BeautifulSoup
from termcolor import colored

def for_one_page_test( url ):
    """Fetch one listing page and download every image it links to.

    The page at ``url`` is requested with a browser-like header set, its raw
    HTML is echoed to stdout in blue, and every ``<a>`` tag whose first CSS
    class is ``view_img_link`` has its ``href`` passed to ``image_downlode``.
    A failure while handling one link is printed and the loop moves on to
    the next link.

    Errors raised while fetching or parsing the page itself are not caught
    here and propagate to the caller.

    Args:
        url: Address of the listing page to scrape.
    """
    # To route traffic through an HTTP proxy, install an opener built with
    # urllib2.ProxyHandler before the request below is made.

    UA = "Mozilla/"+ str(random.randint(10, 100))  +".0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090625 Firefox/3.5"
    print(UA)

    # NOTE(review): the randomised UA above is only printed; the request is
    # sent with the fixed User-Agent below. Confirm which one is intended.
    i_headers = {
        "User-Agent": "Mozilla/8.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5",
        "Referer": 'http://jiandan',
    }
    req = urllib2.Request(url, headers=i_headers)

    # Close the connection explicitly so worker processes do not leak sockets.
    response = urllib2.urlopen(req)
    try:
        html = response.read()
    finally:
        response.close()

    print(colored(html, 'blue'))

    # Name the parser so the result does not depend on which optional
    # parsers happen to be installed on the machine.
    soup = BeautifulSoup(html, 'html.parser')

    for link in soup.find_all('a'):
        classes = link.get('class')
        if isinstance(classes, str):
            # A plain string is a space-separated class list; split it so the
            # check below looks at the first class, not the first character.
            classes = classes.split()
        if not isinstance(classes, list) or not classes:
            # No usable class attribute (also guards the [0] lookup below).
            continue
        if classes[0] != "view_img_link":
            continue

        try:
            img_url = link.get('href')
            print(colored(img_url[2:], 'green'))
            image_downlode(img_url)
        except Exception as e:
            # Best effort: report the failure and carry on with the next link.
            print("%s : %s" % (type(e).__name__, e))


def image_downlode( url, save_dir='/home/elfsong/image/' ):
    """Download one image and store it under a name derived from its URL.

    The file name is ``get_name(url)`` plus the extension taken from ``url``.
    A failure while writing the file is printed and swallowed; errors raised
    by the HTTP request itself propagate to the caller.

    Args:
        url: Image address. A protocol-relative address (``//host/path``)
            is fetched over plain HTTP; an address that already carries a
            scheme is used unchanged.
        save_dir: Directory the image is written into. It is not created
            here, so it must already exist.
    """
    if url.startswith('//'):
        full_url = 'http:' + url
    elif '://' in url:
        # Already absolute; stripping two characters would corrupt it.
        full_url = url
    else:
        full_url = 'http://' + url

    img = requests.get(full_url)

    # Use the real extension so ".jpeg" is not truncated to "peg"; fall back
    # to the last three characters when the URL has no dotted suffix.
    extension = os.path.splitext(url)[1].lstrip('.') or url[-3:]
    name = get_name(url) + '.' + extension

    try:
        # The context manager closes the handle even if the write fails.
        with open(os.path.join(save_dir, name), 'wb') as out:
            out.write(img.content)
        print(name + " done!")
    except (IOError, OSError) as e:
        print("%s : %s" % (type(e).__name__, e))
        print(name + " flased!")

def get_name( url ):
    m = md5.new()
    m.update( url )
    return m.hexdigest()


# Script entry point: scrape a range of listing pages in parallel and print
# the elapsed wall-clock time in seconds.
if __name__ == "__main__":
    # The three *...* tokens below are template placeholders, not valid
    # Python. Replace each with an integer before running:
    #   start     -> first page number to crawl
    #   end       -> last page number to crawl (inclusive, see range below)
    #   processes -> number of worker processes in the pool
    start = *起始爬取页码*
    end = *结尾爬取页码*        
    pool = multiprocessing.Pool(processes = *使用进程数量*)

    # Start of the timed section.
    btime = time.time()

    # Queue one task per page; end+1 makes the range include the last page.
    for page in range(start,end+1):
        # NOTE(review): the host is written without a top-level domain;
        # presumably it was truncated when the post was published - confirm.
        url = "http://jandan/ooxx/page-" + str(page)
        # The AsyncResult is discarded, so an exception raised inside a
        # worker is never retrieved or reported here.
        pool.apply_async(for_one_page_test, (url, ) )

    # Stop accepting new tasks, then wait for all queued pages to finish.
    pool.close()
    pool.join()

    etime = time.time()

    # Total elapsed time in seconds.
    print (etime - btime)