Using Python Crawler Proxies to Increase Website Traffic
Once you have a list of free proxies, there is a lot you can do with it: crawl a site without the risk of getting your IP banned, for example, or, as in this post, drive extra traffic to a website.
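Free proxies die constantly, so it pays to verify each one before routing traffic through it. Here is a minimal sketch of such a check; it is not part of the original script, the check_proxy helper and the httpbin.org test URL are illustrative choices, and it follows the article's Python 2 / urllib2 style.

# coding: utf-8
import urllib2

def check_proxy(ip, port, proxy_type='http', timeout=10):
    # hypothetical helper: route one request through the candidate proxy
    opener = urllib2.build_opener(
        urllib2.ProxyHandler({proxy_type: ip + ':' + port}))
    try:
        # httpbin.org/ip simply echoes the requesting IP address
        opener.open('http://httpbin.org/ip', timeout=timeout).read()
        return True
    except Exception:
        return False

if __name__ == '__main__':
    print check_proxy('1.2.3.4', '3128')  # made-up address; prints True/False

A proxy that passes this check can still fail moments later, which is why the main script below wraps each round in a try/except and simply moves on to the next proxy.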
The complete code:
# coding: utf-8
# Python 2 code: urllib2 and cookielib became urllib.request and
# http.cookiejar in Python 3.
import urllib2
import cookielib
import time
import json
import random

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
class Spide:
    def __init__(self, proxy_ip, proxy_type, proxy_port, use_proxy=False):
        print 'using the proxy info:', proxy_ip
        self.proxy_ip = proxy_ip
        self.proxy_type = proxy_type
        self.proxy_port = proxy_port
        self.proxy = urllib2.ProxyHandler({proxy_type: proxy_ip + ":" + proxy_port})
        self.usercode = ""
        self.userid = ""
        # cookie-aware opener by default; replaced by the proxy opener on request
        self.cj = cookielib.LWPCookieJar()
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))
        if use_proxy:
            self.opener = urllib2.build_opener(self.proxy)
        urllib2.install_opener(self.opener)
    def add_view(self):
        print '--->start adding view'
        print '--->proxy info', self.proxy_ip
        # route PhantomJS through the proxy
        service_args = [
            '--proxy=' + self.proxy_ip + ':' + self.proxy_port,
            '--proxy-type=' + self.proxy_type,
        ]
        # spoof a desktop browser user agent
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 "
            "(KHTML, like Gecko) Chrome/15.0.87"
        )
        driver = webdriver.PhantomJS(executable_path='/home/bin/phantomjs',
                                     service_args=service_args,
                                     desired_capabilities=dcap)
        driver.set_page_load_timeout(90)
        driver.get("http://www.503error.com/")
        # parse the rendered homepage; use the HTML parser, not 'xml'
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        titles = soup.find_all('h1', {'class': 'entry-title'})
        # randint is inclusive on both ends, so cap at len(titles) - 1
        ranCount = random.randint(0, len(titles) - 1)
        print 'randomly picking a link on the site to visit, index:', ranCount
        randomlink = titles[ranCount].a.attrs['href']
        driver.get(randomlink)
        driver.close()
        print 'finish once'
    def get_proxy(self):
        # fetch a single proxy record; 'url2' is a placeholder for a
        # free-proxy API endpoint (see note 4 below)
        proxy_info_json = ""
        print '-->using the ip ' + self.proxy_ip + ' to get the proxy info'
        try:
            reqRequest_proxy = urllib2.Request('url2')
            reqRequest_proxy.add_header('Accept', '*/*')
            reqRequest_proxy.add_header('Accept-Language', 'zh-CN,zh;q=0.8')
            reqRequest_proxy.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36')
            reqRequest_proxy.add_header('Content-Type', 'application/x-www-form-urlencoded')
            proxy_info = urllib2.urlopen(reqRequest_proxy).read()
            print proxy_info
            proxy_info_json = json.loads(proxy_info)
        except Exception, e:
            print 'proxy has a problem'
        return proxy_info_json
    def get_proxys100(self):
        # fetch a batch of proxies; 'url1' is a placeholder for a
        # free-proxy API endpoint (see note 4 below)
        proxy_info_json = ""
        print '-->using the ip ' + self.proxy_ip + ' to get the proxy info 100'
        try:
            reqRequest_proxy = urllib2.Request('url1')
            reqRequest_proxy.add_header('Accept', '*/*')
            reqRequest_proxy.add_header('Accept-Language', 'zh-CN,zh;q=0.8')
            reqRequest_proxy.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36')
            reqRequest_proxy.add_header('Content-Type', 'application/x-www-form-urlencoded')
            proxy_info = urllib2.urlopen(reqRequest_proxy).read()
            proxy_info_json = json.loads(proxy_info)
            return proxy_info_json
        except Exception, e:
            print 'proxy has a problem'
if __name__ == "__main__":
    # first, fetch a proxy
    print 'START ADDING VIEW:'
    print 'Getting the new proxy info for the first time'
    print '---------------------------------------------------------------------------------------------------------'
    for count in range(1):
        # bootstrap without a proxy just to fetch the proxy list
        test = Spide(proxy_ip='youproxyip', proxy_type='http', proxy_port='3128', use_proxy=False)
        proxy_list = test.get_proxy()
        print '->this is round:', count
        print '->Getting the new proxy info:'
        print '->using the proxy to get the proxy list, in case of a ban'
        print '->proxy info', proxy_list
        proxy100 = test.get_proxys100()
        for proxy1 in proxy100:
            try:
                print 'proxy1:', proxy1
                Spide1 = Spide(proxy_ip=proxy1['ip'], proxy_type=proxy1['type'],
                               proxy_port=proxy1['port'], use_proxy=True)
                print 'before add view'
                Spide1.add_view()
                print '->sleep 15 s'
                time.sleep(15)
                # add a random extra delay so the traffic looks less mechanical
                ranTime = random.randint(10, 50)
                print '->sleep random time:', ranTime
                time.sleep(ranTime)
                print '->getting a new proxy'
                #proxy_list = Spide1.get_proxy()
            except Exception, e:
                print '->something went wrong, moving on to the next proxy'
A few brief notes:
The overall flow: 1. fetch a proxy -> 2. visit the homepage -> 3. scrape the homepage's list of posts and open one at random -> 4. wait a random number of seconds -> back to step 1.
1: Replace youproxyip with a proxy IP you already own, or leave it as it is: with use_proxy=False it is ignored anyway. In that case, make sure you can reach the two proxy-list sites in the code without a proxy.
2: /home/bin/phantomjs is the path where PhantomJS is installed; adjust it to your own installation.
3: The code contains two methods for fetching proxies; the example uses one of them. (Don't mock the one-iteration for loop in the main block: an earlier version of this script had an outer loop around it.)
4: Fetching the free proxy lists is not covered here; url1 and url2 stand in for the (withheld) free-proxy API endpoints. The response shape those endpoints need to return is sketched below.
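Although the endpoints are withheld, the expected response shape can be read off from how the script indexes the parsed JSON; the values below are made up. get_proxy() must return a single object:

{"protocol": "http", "ip": "1.2.3.4", "port": "3128"}

and get_proxys100() must return a list, since the main loop reads proxy1['ip'], proxy1['type'], and proxy1['port'] from each element:

[{"ip": "1.2.3.4", "port": "3128", "type": "http"},
 {"ip": "5.6.7.8", "port": "8080", "type": "http"}]

Any free-proxy source can be adapted by reshaping its output into this format.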