自学Python九 爬虫实战二(美图福利)

python

  作为一个新世纪有思想有文化有道德时刻准备着的屌丝男青年,在现在这样一个社会中,心疼我大慢播抵制大百度的前提下,没事儿上上网逛逛YY看看斗鱼翻翻美女图片那是必不可少的,可是美图虽多翻页费劲!今天我们就搞个爬虫把美图都给扒下来!本次实例有2个:煎蛋上的妹子图,某网站的rosi图。我只是一个学习python的菜鸟,技术不可耻,技术是无罪的!!!

  煎蛋:

  先说说程序的流程:获取煎蛋妹子图URL,得到网页代码,提取妹子图片地址,访问图片地址并将图片保存到本地。Ready? 先让我们看看煎蛋妹子网页:

  我们得到URL为:http://jandan.net/ooxx/page-1764#comments     1764就是页码, 首先我们要得到最新的页码,然后向前寻找,然后得到每页中图片的url。下面我们分析网站代码写出正则表达式!

  根据之前文章的方法我们写出如下函数getNewPage:

def __getNewPage(self):
    """Fetch the ooxx index page and return the newest page number.

    Returns the captured page number (a string) on success, or 1500 on
    failure -- jandan removed everything before page 1500, so that is
    the earliest page still worth crawling.
    """
    pageCode = self.Get(self.__Url)
    # Re-encode to the local filesystem/console encoding before matching.
    enc = sys.getfilesystemencoding()
    pattern = re.compile(r'<div .*?cp-pagenavi">.*?<span .*?current-comment-page">\[(.*?)\]</span>', re.S)
    newPage = re.search(pattern, pageCode.decode("UTF-8").encode(enc))
    if newPage is not None:
        return newPage.group(1)
    return 1500  # fallback: oldest surviving page

  不要问我为什么如果失败返回1500。。。 因为煎蛋把1500页之前的图片都给吃了。 你也可以返回0。接下来是提取每页图片地址的函数:

1 def __getAllPicUrl(self,pageIndex):

2 realurl = self.__Url + "page-" + str(pageIndex) + "#comments"

3 pageCode = self.Get(realurl)

4 type = sys.getfilesystemencoding()

5 pattern = re.compile(\'<p>.*?<a .*?view_img_link">.*?</a>.*?<img src="(.*?)".*?</p>\',re.S)

6 items = re.findall(pattern,pageCode.decode("UTF-8").encode(type))

7 for item in items:

8 print item

  好了,得到了图片地址,接下来就是访问图片地址然后保存图片了:

1     def __savePics(self,img_addr,folder):

2 for item in img_addr:

3 filename = item.split(\'/\')[-1]

4 print "正在保存图片:" + filename

5 with open(filename,\'wb\') as file:

6 img = self.Get(item)

7 file.write(img)

  当你觉得信心满满的时候,一定会有一盆冷水浇到你的头上,毕竟程序就是这样,考验你的耐性,打磨你的自信。你测试了一会儿,然后你发现你重启程序后再也无法获取最新页码,你觉得我什么也没动啊为什么会这样。别着急,我们将得到的网页代码打印出来看看:

  看到了吧,是服务器感觉你不像浏览器访问的结果把你的ip给屏蔽了。    真是给跪了,辛辛苦苦码一年,屏蔽回到解放前!那么这个如何解决呢,答:换ip 找代理。接下来我们要改一下我们的HttpClient.py 将里面的opener设置下代理服务器。具体代理服务器请自行百度之,关键字:http代理 。 想找到一个合适的代理也不容易 自己ie Internet选项挨个试试,测试下网速。

 1 # -*- coding: utf-8 -*-

2 import cookielib, urllib, urllib2, socket

3 import zlib,StringIO

4 class HttpClient:

5 __cookie = cookielib.CookieJar()

6 __proxy_handler = urllib2.ProxyHandler({"http" : \'42.121.6.80:8080\'})#设置代理服务器与端口

7 __req = urllib2.build_opener(urllib2.HTTPCookieProcessor(__cookie),__proxy_handler)#生成opener

8 __req.addheaders = [

9 (\'Accept\', \'application/javascript, */*;q=0.8\'),

10 (\'User-Agent\', \'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)\')

11 ]

12 urllib2.install_opener(__req)

13

14 def Get(self, url, refer=None):

15 try:

16 req = urllib2.Request(url)

17 #req.add_header(\'Accept-encoding\', \'gzip\')

18 if not (refer is None):

19 req.add_header(\'Referer\', refer)

20 response = urllib2.urlopen(req, timeout=120)

21 html = response.read()

22 #gzipped = response.headers.get(\'Content-Encoding\')

23 #if gzipped:

24 # html = zlib.decompress(html, 16+zlib.MAX_WBITS)

25 return html

26 except urllib2.HTTPError, e:

27 return e.read()

28 except socket.timeout, e:

29 return \'\'

30 except socket.error, e:

31 return \'\'

  然后,就可以非常愉快的查看图片了。不过用了代理速度好慢。。。可以设置timeout稍微长一点儿,防止图片下载不下来!

  好了,rosi的下篇文章再放!现在是时候上一波代码了:

 1 # -*- coding: utf-8 -*-

2 import cookielib, urllib, urllib2, socket

3 import zlib,StringIO

4 class HttpClient:

5 __cookie = cookielib.CookieJar()

6 __proxy_handler = urllib2.ProxyHandler({"http" : \'42.121.6.80:8080\'})

7 __req = urllib2.build_opener(urllib2.HTTPCookieProcessor(__cookie),__proxy_handler)

8 __req.addheaders = [

9 (\'Accept\', \'application/javascript, */*;q=0.8\'),

10 (\'User-Agent\', \'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)\')

11 ]

12 urllib2.install_opener(__req)

13

14 def Get(self, url, refer=None):

15 try:

16 req = urllib2.Request(url)

17 req.add_header(\'Accept-encoding\', \'gzip\')

18 if not (refer is None):

19 req.add_header(\'Referer\', refer)

20 response = urllib2.urlopen(req, timeout=120)

21 html = response.read()

22 gzipped = response.headers.get(\'Content-Encoding\')

23 if gzipped:

24 html = zlib.decompress(html, 16+zlib.MAX_WBITS)

25 return html

26 except urllib2.HTTPError, e:

27 return e.read()

28 except socket.timeout, e:

29 return \'\'

30 except socket.error, e:

31 return \'\'

32

33 def Post(self, url, data, refer=None):

34 try:

35 #req = urllib2.Request(url, urllib.urlencode(data))

36 req = urllib2.Request(url,data)

37 if not (refer is None):

38 req.add_header(\'Referer\', refer)

39 return urllib2.urlopen(req, timeout=120).read()

40 except urllib2.HTTPError, e:

41 return e.read()

42 except socket.timeout, e:

43 return \'\'

44 except socket.error, e:

45 return \'\'

46

47 def Download(self, url, file):

48 output = open(file, \'wb\')

49 output.write(urllib2.urlopen(url).read())

50 output.close()

51

52 # def urlencode(self, data):

53 # return urllib.quote(data)

54

55 def getCookie(self, key):

56 for c in self.__cookie:

57 if c.name == key:

58 return c.value

59 return \'\'

60

61 def setCookie(self, key, val, domain):

62 ck = cookielib.Cookie(version=0, name=key, value=val, port=None, port_specified=False, domain=domain, domain_specified=False, domain_initial_dot=False, path=\'/\', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={\'HttpOnly\': None}, rfc2109=False)

63 self.__cookie.set_cookie(ck)

64 #self.__cookie.clear() clean cookie

65 # vim : tabstop=2 shiftwidth=2 softtabstop=2 expandtab

HttpClient

 1 # -*- coding: utf-8 -*-

2 from __future__ import unicode_literals

3 from HttpClient import HttpClient

4 import sys,re,os

5 class JianDan(HttpClient):

6 def __init__(self):

7 self.__pageIndex = 1500 #之前的图片被煎蛋吞了

8 self.__Url = "http://jandan.net/ooxx/"

9 self.__floder = "jiandan"

10 def __getAllPicUrl(self,pageIndex):

11 realurl = self.__Url + "page-" + str(pageIndex) + "#comments"

12 pageCode = self.Get(realurl)

13 type = sys.getfilesystemencoding()

14 pattern = re.compile(\'<p>.*?<a .*?view_img_link">.*?</a>.*?<img src="(.*?)".*?</p>\',re.S)

15 items = re.findall(pattern,pageCode.decode("UTF-8").encode(type))

16 for item in items:

17 print item

18 self.__savePics(items,self.__floder)

19

20 def __savePics(self,img_addr,folder):

21 for item in img_addr:

22 filename = item.split(\'/\')[-1]

23 print "正在保存图片:" + filename

24 with open(filename,\'wb\') as file:

25 img = self.Get(item)

26 file.write(img)

27

28 def __getNewPage(self):

29 pageCode = self.Get(self.__Url)

30 type = sys.getfilesystemencoding()

31 pattern = re.compile(r\'<div .*?cp-pagenavi">.*?<span .*?current-comment-page">\[(.*?)\]</span>\',re.S)

32 newPage = re.search(pattern,pageCode.decode("UTF-8").encode(type))

33 print pageCode.decode("UTF-8").encode(type)

34 if newPage != None:

35 return newPage.group(1)

36 return 1500

37

38 def start(self):

39 isExists=os.path.exists(self.__floder)#检测是否存在目录

40 print isExists

41 if not isExists:

42 os.mkdir(self.__floder)

43 os.chdir(self.__floder)

44 page = int(self.__getNewPage())

45 for i in range(self.__pageIndex,page):

46 self.__getAllPicUrl(i)

47

48 if __name__ == \'__main__\':

49 jd = JianDan()

50 jd.start()

JianDan

以上是 自学Python九 爬虫实战二(美图福利) 的全部内容, 来源链接: utcz.com/z/386787.html

回到顶部