python Scrapy Selenium PhantomJS: Scraping Weibo Images

1. Create the project

scrapy startproject weibo                        # create the project
scrapy genspider -t basic weibo.com weibo.com    # create the spider

Directory structure
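For reference, a newly generated project has roughly the layout below (taken from Scrapy's default project template; newer Scrapy versions also add a middlewares.py):

weibo/
├── scrapy.cfg
└── weibo/
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── weibo_com.py      # created by the genspider command above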

2. Define the Items

Edit items.py:

import scrapy

class WeiboItem(scrapy.Item):
    # define the fields for your item here like:
    image_urls = scrapy.Field()   # list of image URLs in one post
    dirname = scrapy.Field()      # sub-directory name for this group of images

 

3. Edit pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import hashlib
# from scrapy.contrib.pipeline.images import ImagesPipeline
# from scrapy.http import Request
from scrapy.utils.python import to_bytes
import urllib
import os
import redis

# Scrapy's built-in image downloader handles animated GIFs poorly (most likely
# because it re-encodes the images, which drops the animation), so a custom
# pipeline is used here to keep GIFs intact.
# class WeiboPipeline(ImagesPipeline):
#     # def process_item(self, item, spider):
#     #     return item
#     def get_media_requests(self, item, info):
#         for image_url in item['image_urls']:
#             request = Request('http:' + image_url)
#             request.meta['item'] = {'dirname': item['dirname']}
#             yield request
#
#     # define the storage directory and file extension
#     def file_path(self, request, response=None, info=None):
#         url = request.url
#         item = request.meta['item']
#         image_guid = hashlib.sha1(to_bytes(url)).hexdigest()
#         ext = url.split('.')[-1]
#         url = 'full/%s/%s.%s' % (item['dirname'], image_guid, ext)
#         return url
#
#     def item_completed(self, results, item, info):
#         return item

# The native Scrapy image download is not used because GIFs come out
# incomplete and do not animate.
class WeiboPipeline(object):
    # de-duplicate with redis -- a bit crude, but it works

    def open_spider(self, spider):
        self.client = redis.Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        file_path = item['dirname']
        # redis de-duplication check
        yn = self.client.get(file_path)
        if yn is not None:
            print 'already downloaded'
            return item
        for image_url in item['image_urls']:
            imageurl = 'http:' + image_url
            savepath = self.get_file_path(file_path, imageurl)
            print imageurl, savepath
            try:
                # download the image to the target path
                urllib.urlretrieve(imageurl, savepath)
            except Exception as e:
                print str(e)
        # the hash of each post's group of image URLs is used as the unique key
        self.client.set(file_path, 1)
        return item

    def get_file_path(self, dirname, url):
        """Build the storage path for one image."""
        image_guid = hashlib.sha1(to_bytes(url)).hexdigest()
        ext = url.split('.')[-1]
        # the storage directory is hard-coded here; it could be read from settings
        file_dir = './full/%s' % (dirname,)
        if not os.path.exists(file_dir):
            os.makedirs(file_dir)
        return '%s/%s.%s' % (file_dir, image_guid, ext)
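The storage directory above is hard-coded to ./full. As a minimal sketch of the "could be read from settings" idea (IMAGES_ROOT is an assumed, project-specific setting name, not a built-in Scrapy setting), the pipeline could pull it in via from_crawler:

class WeiboPipeline(object):

    def __init__(self, images_root):
        # root directory for downloaded images, taken from settings.py
        self.images_root = images_root

    @classmethod
    def from_crawler(cls, crawler):
        # IMAGES_ROOT is an assumed setting; './full' keeps the old behaviour as the default
        return cls(crawler.settings.get('IMAGES_ROOT', './full'))

get_file_path would then build file_dir from self.images_root instead of the literal './full'.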

  

4. Write the spider

spiders/weibo_com.py

# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import os
import time
from collections import defaultdict
from scrapy.selector import Selector
import re
import urllib
from weibo.items import WeiboItem
import hashlib

class WebComSpider(scrapy.Spider):
    name = 'weibo.com'
    allowed_domains = ['weibo.com']
    start_urls = ['https://weibo.com/']
    # where the cookies are cached
    cookie_file_path = './cookies.json'
    # IDs of the bloggers to crawl
    uids = ['5139152205']

    def saveCookie(self, cookie):
        """Save the cookie to a file."""
        with open(self.cookie_file_path, 'w') as outputfile:
            json.dump(cookie, outputfile)

    def getCookie(self):
        """Load the cookie from the file."""
        if not os.path.exists(self.cookie_file_path):
            self.cookie = None
            return
        with open(self.cookie_file_path, 'r') as inputfile:
            self.cookie = json.load(inputfile)

    def start_requests(self):
        """Crawl Weibo."""
        self.getCookie()
        # if there is no cookie yet, simulate a login to get one
        if self.cookie is None:
            # use PhantomJS as a headless browser
            driver = webdriver.PhantomJS(executable_path='/data/software/phantomjs-2.1.1-linux-x86_64/bin/phantomjs')
            driver.get('https://weibo.com')
            try:
                # set the window size -- important: without it the elements below cannot be found
                driver.set_window_size(1920, 1080)
                # wait until the 'loginname' element is present before continuing
                userElen = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, 'loginname'))
                )
                # wait a bit to dodge anti-scraping checks
                time.sleep(3)
                print 'sleep 3'
                # fill in the login username (placeholder -- use your own)
                userElen.send_keys('YOUR-USERNAME')
                print 'sleep 5'
                time.sleep(5)
                pasElen = driver.find_element_by_xpath('//*[@id="pl_login_form"]/div/div[3]/div[2]/div/input')
                # fill in the login password (placeholder -- use your own)
                pasElen.send_keys('YOUR-PASSWORD')
                print 'sleep 1'
                time.sleep(1)
                sumbButton = driver.find_element_by_xpath('//*[@id="pl_login_form"]/div/div[3]/div[6]/a')
                # submit the login form
                sumbButton.click()
                # once the page title contains "我的首页" (my homepage), login succeeded; continue
                element = WebDriverWait(driver, 10).until(
                    EC.title_contains(u'我的首页')
                )
            except Exception as e:
                print 'login failed:', str(e)
            # read the cookies from the browser session
            ck = driver.get_cookies()
            self.cookie = defaultdict()
            for item in ck:
                self.cookie[item['name']] = item['value']
            # save the cookies for next time
            self.saveCookie(self.cookie)
        # use the cookie to crawl the image posts of the chosen bloggers
        for uid in self.uids:
            # list of the blogger's posts that contain pictures; adjust to your own needs
            url = 'https://weibo.com/u/%s?profile_ftype=1&is_pic=1#_0' % (uid,)
            request = scrapy.Request(url=url, cookies=self.cookie, callback=self.parse)
            request.meta['item'] = {'uid': uid, 'page_num': 1}
            yield request

    def parse(self, response):
        """
        Parse a profile page. Weibo paginates with both full page jumps and Ajax:
        after every page jump there are two Ajax requests that load more posts.
        The page is rendered entirely from JS-embedded HTML strings, so XPath/CSS
        selectors cannot be used directly; regular expressions are used instead.
        """
        title = response.xpath('//title/text()').extract()[0]
        print title
        selector = Selector(text=response.body)
        # extract the parameters needed for the Ajax requests
        pageId = selector.re(r"\$CONFIG\['page_id'\]='(\d+)'")[0]
        domain = selector.re(r"\$CONFIG\['domain'\]='(\d+)'")[0]
        # parse the HTML delivered with the page jump itself
        for itemObj in self.parse_content(selector):
            yield itemObj
        # issue the two Ajax requests for the rest of the page
        item = response.meta['item']
        ajaxUrl = 'https://weibo.com/p/aj/v6/mblog/mbloglist?ajwvr=6&domain=%s&profile_ftype=1&is_pic=1&pagebar=%s&pl_name=Pl_Official_MyProfileFeed__21&id=%s&script_uri=/u/%s&feed_type=0&page=%s&pre_page=1&domain_op=%s&__rnd=%s'
        for num in range(0, 2):
            rand = str(int(time.time() * 1000))
            url = ajaxUrl % (domain, num, pageId, item['uid'], item['page_num'], domain, rand)
            print url
            print '------------sleep 10------------'
            time.sleep(10)
            yield scrapy.Request(url=url, cookies=self.cookie, callback=self.parse_ajax)
        # move on to the next page
        item['page_num'] += 1
        nexpage = 'https://weibo.com/u/%s?is_search=0&visible=0&is_pic=1&is_tag=0&profile_ftype=1&page=%s#feedtop' % (item['uid'], item['page_num'])
        request = scrapy.Request(url=nexpage, cookies=self.cookie, callback=self.parse)
        request.meta['item'] = item
        yield request

    def parse_ajax(self, response):
        """Parse the Ajax responses."""
        bodyObj = json.loads(response.body)
        selector = Selector(text=bodyObj['data'])
        for itemObj in self.parse_content(selector):
            yield itemObj

    def parse_content(self, selector):
        """Extract the image URLs."""
        pre = re.compile(r'clear_picSrc=(.*?)[\&|\\"]')
        imagelist = selector.re(pre)
        for row in imagelist:
            hs = hashlib.md5()
            hs.update(row)
            row = urllib.unquote(row)
            # the hash of each group of image URLs is both the unique key and the sub-directory name
            imgset = row.split(',')
            yield WeiboItem(image_urls=imgset, dirname=hs.hexdigest())

  

5. Modify settings.py

ROBOTSTXT_OBEY = False   # do not obey robots.txt

Register WeiboPipeline:

ITEM_PIPELINES = {
    'weibo.pipelines.WeiboPipeline': 300,
}
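While testing, it can also help to throttle Scrapy itself; the settings below are illustrative assumptions rather than part of the original setup:

DOWNLOAD_DELAY = 5          # extra delay between requests, on top of the sleeps in the spider
CONCURRENT_REQUESTS = 1     # keep requests sequential to look less like a bot
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0 Safari/537.36'   # assumed browser UA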

6. Run the spider

scrapy crawl weibo.com
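Individual settings can also be overridden on the command line, for example to cut down the log output:

scrapy crawl weibo.com -s LOG_LEVEL=INFO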

  

Source: utcz.com/z/386812.html
