Using Python crawler proxies to increase website traffic


Once you have a list of free proxies, there is a lot you can do with it: for example, crawl a site without the risk of having your IP banned, or drive extra traffic to a website.

Complete code:

#coding:utf-8
# only the imports the script actually uses are kept
import urllib2
import cookielib
import time
import json
import random

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from bs4 import BeautifulSoup

class Spide:

    def __init__(self, proxy_ip, proxy_type, proxy_port, use_proxy=False):
        print 'using the proxy info :', proxy_ip
        self.proxy_ip = proxy_ip
        self.proxy_type = proxy_type
        self.proxy_port = proxy_port
        self.proxy = urllib2.ProxyHandler({proxy_type: proxy_ip + ":" + proxy_port})
        self.usercode = ""
        self.userid = ""
        # keep cookies between requests
        self.cj = cookielib.LWPCookieJar()
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))
        if use_proxy:
            # when a proxy is requested, rebuild the opener around the proxy handler
            self.opener = urllib2.build_opener(self.proxy)
        urllib2.install_opener(self.opener)

    def add_view(self):
        print '--->start adding view'
        print '--->proxy info', self.proxy_ip
        # hand the proxy to PhantomJS so the page visits go through it
        service_args = [
            '--proxy=' + self.proxy_ip + ':' + self.proxy_port,
            '--proxy-type=' + self.proxy_type,
        ]
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 "
            "(KHTML, like Gecko) Chrome/15.0.87"
        )

        driver = webdriver.PhantomJS(executable_path='/home/bin/phantomjs',
                                     service_args=service_args,
                                     desired_capabilities=dcap)
        driver.set_page_load_timeout(90)
        # load the homepage
        driver.get("http://www.503error.com/")
        # collect the post titles and follow one at random
        soup = BeautifulSoup(driver.page_source, 'lxml')
        titles = soup.find_all('h1', {'class': 'entry-title'})
        # len(titles) - 1 keeps the random index in range
        ranCount = random.randint(0, len(titles) - 1)
        print 'random find a link of the website to access , random is :', ranCount
        randomlink = titles[ranCount].a.attrs['href']

        driver.get(randomlink)
        # quit() shuts PhantomJS down completely so processes do not pile up in the loop
        driver.quit()
        print 'finish once'

    def get_proxy(self):
        proxy_info_json = ""
        # fetch a single proxy record from the (hidden) proxy API, see note 4 below
        print '-->using the ip ' + self.proxy_ip + ' to get the proxy info'
        try:
            reqRequest_proxy = urllib2.Request('url2')
            reqRequest_proxy.add_header('Accept', '*/*')
            reqRequest_proxy.add_header('Accept-Language', 'zh-CN,zh;q=0.8')
            reqRequest_proxy.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36')
            reqRequest_proxy.add_header('Content-Type', 'application/x-www-form-urlencoded')

            proxy_info = urllib2.urlopen(reqRequest_proxy).read()
            print proxy_info
            # expected fields in the response: protocol, ip, port
            proxy_info_json = json.loads(proxy_info)
        except Exception, e:
            print 'proxy has a problem'
        return proxy_info_json

    def get_proxys100(self):
        # fetch a batch of proxies (a JSON list) from the (hidden) proxy-list API, see note 4 below
        print '-->using the ip ' + self.proxy_ip + ' to get the proxy info (batch)'
        try:
            reqRequest_proxy = urllib2.Request('url1')
            reqRequest_proxy.add_header('Accept', '*/*')
            reqRequest_proxy.add_header('Accept-Language', 'zh-CN,zh;q=0.8')
            reqRequest_proxy.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36')
            reqRequest_proxy.add_header('Content-Type', 'application/x-www-form-urlencoded')
            proxy_info = urllib2.urlopen(reqRequest_proxy).read()
            # each entry is expected to carry 'ip', 'port' and 'type'
            proxy_info_json = json.loads(proxy_info)
            return proxy_info_json
        except Exception, e:
            print 'proxy has a problem'
            # return an empty list so the caller's loop simply skips this round
            return []

if __name__ == "__main__":
    # first fetch the proxy list, then loop over it adding views
    print 'START ADDING VIEW:'
    print 'Getting the new proxy info for the first time'
    print '---------------------------------------------------------------------------------------------------------'
    for count in range(1):
        # bootstrap instance: replace 'youproxyip' with a proxy you own, or leave
        # use_proxy=False and fetch the proxy list over a direct connection (see note 1)
        test = Spide(proxy_ip='youproxyip', proxy_type='http', proxy_port='3128', use_proxy=False)
        proxy_list = test.get_proxy()
        print '->this is the :', count
        print '->Getting the new proxy info:'
        print '->using a proxy to fetch the proxy list in case the direct IP gets banned'
        print '->proxy info', proxy_list

        proxy100 = test.get_proxys100()
        for proxy1 in proxy100:
            try:
                print 'proxy1:', proxy1
                Spide1 = Spide(proxy_ip=proxy1['ip'], proxy_type=proxy1['type'], proxy_port=proxy1['port'], use_proxy=True)
                print 'before add view'
                Spide1.add_view()
                print '->sleep 15 s'
                time.sleep(15)
                # sleep an extra random time so the visits do not look scripted
                ranTime = random.randint(10, 50)
                print '->sleep random time:', ranTime
                time.sleep(ranTime)
                print '-> getting new proxy '
                #proxy_list = Spide1.get_proxy()
            except Exception, e:
                print '->something went wrong, moving on to the next proxy'

A few small notes:

The overall flow is: 1. get a proxy -> 2. visit the homepage -> 3. grab the homepage's list of blog posts and visit one at random -> 4. wait a random number of seconds -> go back to step 1.
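
Condensed into a few lines, that flow looks roughly like the sketch below. It reuses the Spide class from the complete code above and is not a drop-in replacement for the script:

#coding:utf-8
import time
import random

bootstrap = Spide(proxy_ip='youproxyip', proxy_type='http', proxy_port='3128', use_proxy=False)
while True:                                    # the published script runs this body only once (see note 3)
    for p in bootstrap.get_proxys100():        # step 1: fetch a batch of free proxies
        s = Spide(proxy_ip=p['ip'], proxy_type=p['type'],
                  proxy_port=p['port'], use_proxy=True)
        s.add_view()                           # steps 2-3: homepage, then one random post
        time.sleep(random.randint(10, 50))     # step 4: random wait before the next proxy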

1: Replace youproxyip with a proxy IP you already own, or leave it as-is, since that first instance is created with use_proxy=False; in that case, make sure you can reach the two proxy-scraping sites used in the code without a proxy. (A quick way to check that a given proxy actually works is sketched after these notes.)

2: /home/bin/phantomjs is the path where phantomjs is installed on my machine; change it to match your own installation.

3: The code contains two methods for fetching proxies, and the example only uses one of them (don't flame me for the outer for loop that runs exactly once; an earlier version of this script had a real outer loop).

4: I won't spell out where to get the free proxies; url1 and url2 stand in for the hidden free-proxy sites. (The rough shape of the JSON the script expects back from them is shown below.)
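
Whichever free-proxy service you plug in for url1 and url2, the script assumes responses shaped roughly like the following; the field names come from how the code indexes the JSON, and the addresses here are made up:

# what get_proxy() (url2) is expected to return: a single proxy record
single_proxy = {"protocol": "http", "ip": "1.2.3.4", "port": "3128"}

# what get_proxys100() (url1) is expected to return: a list of such records
proxy_batch = [
    {"type": "http", "ip": "1.2.3.4", "port": "3128"},
    {"type": "http", "ip": "5.6.7.8", "port": "8080"},
]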
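
And, as mentioned in note 1, here is a minimal sketch for sanity-checking a proxy before handing it to the crawler; the echo URL httpbin.org/ip and the helper name proxy_works are my own choices, not part of the original script:

#coding:utf-8
import urllib2

def proxy_works(ip, port, proxy_type='http', timeout=10):
    # returns True if a simple GET through the proxy succeeds within the timeout
    opener = urllib2.build_opener(urllib2.ProxyHandler({proxy_type: ip + ":" + port}))
    try:
        opener.open('http://httpbin.org/ip', timeout=timeout).read()
        return True
    except Exception:
        return False

print proxy_works('1.2.3.4', '3128')  # hypothetical proxy address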

