Python -- Crawler Basics


1、

# -*- coding: utf-8 -*-
"""
Created on Thu Apr 25 10:30:26 2019

@author: Office
"""
import urllib.request

# The site to crawl
url = "http://www.baidu.com/"

# response: send a request to the given URL and get back the HTTP response (a file-like object)
response = urllib.request.urlopen(url)

# Read the content (bytes)
data = response.read()          # read the whole response; the result is a bytes object
#data = response.readline()     # read a single line; printing everything this way needs a loop
#data = response.readlines()    # read the whole response as a list of lines
#print(data)
#print(type(data))

# Decode the raw bytes into a string
str_data = data.decode("utf-8")
#print(str_data)
#print(type(str_data))

# Write the crawled page to a file
# Method 1
with open("baidu.html", "w", encoding="utf-8") as f:  # write it out as str
    f.write(str_data)

# Method 2: urlretrieve leaves some cache behind while it runs, so clean up afterwards
#urllib.request.urlretrieve(url, "baidu2.html")
#urllib.request.urlcleanup()  # clear the cache (urlcleanup takes no arguments)

# Useful response attributes
#print(response.info())     # header / environment information for the response
#print(response.getcode())  # status code: 200 OK, 304 Not Modified (the client already has a cached copy),
#                           # 400 Bad Request (e.g. malformed syntax), 500 Internal Server Error
#print(response.geturl())   # the URL that was actually fetched
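
The comment above notes that readline() needs a loop if you want the whole page; a minimal sketch of that loop (my own addition, plain urllib, nothing else assumed) could look like this:

import urllib.request

response = urllib.request.urlopen("http://www.baidu.com/")
# readline() returns one bytes line per call and b"" once the stream is exhausted
while True:
    line = response.readline()
    if not line:
        break
    print(line.decode("utf-8"))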

 2、

 

# -*- coding: utf-8 -*-
"""
Created on Thu Apr 25 15:09:34 2019

@author: Office
"""
import urllib.request
url = "http://www.baidu.com/"
# Fake a browser User-Agent header
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}

# Build a Request object that carries the headers
req = urllib.request.Request(url, headers=headers)

# Send the request
response = urllib.request.urlopen(req)
data = response.read().decode('utf-8')
print(data)

 

 3、

# -*- coding: utf-8 -*-
"""
Created on Thu Apr 25 15:17:49 2019

@author: Office
"""

import urllib.request
import random
url = "http://www.baidu.com/"
# Pool of User-Agent strings to choose from
agentlist = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
]
# Pick a User-Agent at random
agentStr = random.choice(agentlist)
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent': agentStr,
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Type': 'application/x-www-form-urlencoded'
}

# Build the Request object
req = urllib.request.Request(url, headers=headers)

# Send the request
response = urllib.request.urlopen(req)
data = response.read().decode('utf-8')
#print(data)
print(req.get_full_url())            # the full URL of the request
print(req.get_header('User-agent'))  # get_header capitalizes only the first word, so pass 'User-agent'

# Second approach: add the User-Agent after the Request has been created
url = "http://www.baidu.com/"
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Type': 'application/x-www-form-urlencoded'
}

user_angent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
]

end_user_angent = random.choice(user_angent_list)

req = urllib.request.Request(url, headers=headers)
req.add_header('User-Agent', end_user_angent)

response = urllib.request.urlopen(req)
data = response.read().decode('utf-8')
print(data)

 4、

# -*- coding: utf-8 -*-
"""
Created on Thu Apr 25 16:10:42 2019

@author: Office
"""
import urllib.request
url = "http://www.baidu.com/"
#如果网页长时间未响应,系统判断超时,无法爬取
for i in range(1,100):
try:
response=urllib.request.urlopen(url,timeout=0.2)
print(len(response.read().decode(\'utf-8\')))
except:
print("请求超时,继续下一个爬取")

 

 5、

# -*- coding: utf-8 -*-
"""
Created on Thu Apr 25 16:24:45 2019

@author: Office
"""
# HTTP use case: passing messages between a client and a server
# GET: pass information through the URL itself, appended directly to the address
# POST: submit data to the server; the common, more secure way to transfer data
# PUT: ask the server to store a resource, usually at a specified location
# DELETE: ask the server to delete a resource


'''
GET request
How it works: the data is appended to the request path and sent to the server

Pros: fast

Cons: small payload, not secure

'''
import urllib.request
import urllib.parse
import string
import random

# Constraining a single value
#url = 'http://www.baidu.com/s?wd='
#
#wd = '图片'
#wd = urllib.parse.quote(wd)
#end_url = url + wd
#
#headers = {
#    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
#    'X-Requested-With': 'XMLHttpRequest',
#    'Content-Type': 'application/x-www-form-urlencoded'
#}
#
#user_angent_list = [
#    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
#    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
#    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
#    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
#    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
#]
#
#end_user_angent = random.choice(user_angent_list)
#
#req = urllib.request.Request(end_url, headers=headers)
#req.add_header('User-Agent', end_user_angent)
#
#response = urllib.request.urlopen(req)
#data = response.read().decode('utf-8')
#print(data)

 

# Constraining several values
url = 'https://www.baidu.com/s?'
da_ta = {
    'wd': '风景',
    'key': 'zhang',
    'value': 'san'
}
final_da_ta = urllib.parse.urlencode(da_ta)

final_url = url + final_da_ta

end_url = urllib.parse.quote(final_url, safe=string.printable)
print(end_url)
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Type': 'application/x-www-form-urlencoded'
}

user_angent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
]

end_user_angent = random.choice(user_angent_list)
headers['User-Agent'] = end_user_angent
req = urllib.request.Request(end_url, headers=headers)
response = urllib.request.urlopen(req)
data = response.read().decode('utf-8')
print(data)
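
For reference, a small standalone sketch (my own addition) of what quote() and urlencode() actually produce for inputs like the ones above; the values in the comments are what CPython 3 prints for these strings:

import urllib.parse

# quote() percent-encodes a single value
print(urllib.parse.quote('图片'))             # -> %E5%9B%BE%E7%89%87

# urlencode() builds a whole key=value&key=value query string from a dict
query = urllib.parse.urlencode({'wd': '风景', 'key': 'zhang'})
print(query)                                   # -> wd=%E9%A3%8E%E6%99%AF&key=zhang
print('https://www.baidu.com/s?' + query)      # the final GET URL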

 

 6、

# -*- coding: utf-8 -*-
"""
Created on Sun Apr 28 16:50:51 2019

@author: Office
"""
'''
POST request
How it works: the parameters are packed up and sent separately from the URL

Pros: large payload, more secure (recommended whenever the request modifies data on the server)

Cons: slower
'''

import urllib.parse
import urllib.request
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'

headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Referer': 'http://fanyi.youdao.com/?keyfrom=dict2.index',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest'
}
# Collect the data to send into a dict
# The keys come from the page itself, usually the name attribute of the input tags

key = input("Enter the text you want translated: ")
data = {
    'i': key,
    'from': 'AUTO',
    'to': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '15564473252080',
    'sign': 'b6f44d14938df7391a28b66252a461aa',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_CLICKBUTTION'
}
# Pack the data to send; remember to encode it to bytes
da_ta = urllib.parse.urlencode(data).encode('utf-8')
# Send the request
end_data = urllib.request.urlopen(url, da_ta).read().decode('utf-8')
print(end_data)
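
The script above only prints the raw JSON string. If the response still follows the translateResult layout this interface used to return (an assumption, not something the page guarantees), the translation can be pulled out with json.loads:

import json

# end_data is the JSON string produced by the script above
result = json.loads(end_data)
# historical layout: {"translateResult": [[{"src": "...", "tgt": "..."}]], ...}
if 'translateResult' in result:
    for row in result['translateResult']:
        for item in row:
            print(item.get('src'), '->', item.get('tgt'))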

 7、

# -*- coding: utf-8 -*-
"""
Created on Mon Apr 29 11:02:48 2019

@author: Office
"""

from bs4 import BeautifulSoup
import urllib.request

# Parse a local file
soup = BeautifulSoup(open("soup_text.html", encoding="utf-8"), 'lxml')

# Look up by tag name
#print(soup.a)    # only finds the first matching tag
#print(soup.div)

# Get attributes
#print(soup.a["href"])        # get the href attribute
#print(soup.a.attrs)          # get all attributes and values as a dict
#print(soup.a.attrs["href"])  # equivalent way to get href

# Get the text
#print(soup.a.text)
#print(soup.a.string)
#print(soup.a.get_text())

# Difference between the three: if the tag contains nested tags, string returns None,
# while the other two still return the text content
#print(soup.div.text)
#print(soup.div.string)
#print(soup.div.get_text())
#print(soup.div.get_text().split()[0])  # grab the first element inside

# find: always returns the first tag that matches
#print(soup.find('a'))               # the first matching a
#print(soup.find('a', title="qin"))  # narrow the search with the extra condition title="qin"
#print(soup.find('a', class_="du"))  # class is a Python keyword, so the argument needs the underscore
#print(soup.find('a', id="feng"))

# find works not only on soup but on any ordinary tag object; it searches for matching
# nodes inside that object, which lets you walk down level by level
#div = soup.find('div', class_='tang')
#print(div.find('a', alt="qi"))
#print(div.find('a', class_="du"))  # if two tags match, you still only get the first one

# find_all
#lt = soup.find_all('a')  # all a tags
#print(lt, len(lt))

#div = soup.find('div', class_='tang')
#print(div.find_all('a'))
#print(div.find_all(['i', 'b']))    # find_all also accepts several tag names as a list
#print(div.find_all('a', limit=2))  # find all, keep only the first 2

# select: pick nodes with CSS selectors
# Common selectors: tag, class, id, combined, descendant and attribute selectors
# select always returns a list; use an index to pull out one object, then read its attributes or text
#print(soup.select('div > ul > li > a'))      # there must be spaces around the > combinator
#print(soup.select('div > ul > li > a')[0])   # take the first one
#print(soup.select('.tang > ul > li > a')[0]) # same result written with a class selector
#print(soup.select('#du'))                    # id selector
#print(soup.select('#feng')[0].text)          # select returns a list, so index first, then read the text
#print(soup.select('#feng')[0]['href'])       # the value of href

# select can also be called on an ordinary tag object; it finds all matching nodes under that object
#div = soup.find('div', class_='tang')
#print(div.select('.du'))
#print(soup.select('.du'))
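
Since soup_text.html is a local test file that is not included here, a tiny self-contained sketch (with made-up HTML of my own) shows the same find / select / text-vs-string behaviour:

from bs4 import BeautifulSoup

demo_html = '''
<div class="tang">
  <ul>
    <li><a href="http://example.com/1" class="du" title="qin">first link</a></li>
    <li><a href="http://example.com/2" id="feng">second link</a></li>
  </ul>
</div>
'''
soup = BeautifulSoup(demo_html, 'html.parser')   # html.parser needs no extra install

print(soup.a['href'])                                  # http://example.com/1
print(soup.find('a', class_='du').text)                # first link
print(soup.select('.tang > ul > li > a')[1]['href'])   # http://example.com/2

div = soup.find('div', class_='tang')
print(div.string)      # None, because the div contains nested tags
print(div.get_text())  # all of the text with the nested tags stripped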

 8、

# -*- coding: utf-8 -*-
"""
Created on Wed May 1 11:05:33 2019

@author: admin
"""

import urllib.request
import urllib.parse
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

url=\'http://www.renren.com/970622703/profile\'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/63.0.3239.132 Safari/537.36',
    'Cookie': 'anonymid=jv4jjsmt8luy21; ln_uact=17767258153; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; __guid=238633222.311801873786504100.1556674290342.3481; jebe_key=51ea37de-35c3-4754-82ed-2cc4fbe57341%7C0ff20ead6ae99fd72934c187b694b4f1%7C1556674288363%7C1%7C1556674291856; jebe_key=51ea37de-35c3-4754-82ed-2cc4fbe57341%7C0ff20ead6ae99fd72934c187b694b4f1%7C1556674288363%7C1%7C1556674291858; wp_fold=0; depovince=GW; _r01_=1; JSESSIONID=abcnRiMszrXoLbNlVdXPw; ick_login=4c390ed0-4fe6-4264-b9b2-610a614ac13c; first_login_flag=1; jebecookies=989247e8-b114-48f9-9592-aec3cd10e92b|||||; _de=7266BDD6184F288A5EF7AB01E3CFE338; p=38e98cbf34016e9010c9f1f73791f2423; t=3b04ed4095e7a4b7612203f7169bbc843; societyguester=3b04ed4095e7a4b7612203f7169bbc843; id=970622703; xnsid=8ebbfe1f; ver=7.0; loginfrom=null; monitor_count=9',
}

req = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(req)
print(response.read().decode('utf-8'))

 9、

# -*- coding: utf-8 -*-
"""
Created on Sun Jul 15 08:52:30 2018

@author: T0001
"""

html = '''<tr>
<td class="posterColumn">
<span name="rk" data-value="1"></span>
<span name="ir" data-value="9.216510839765467"></span>
<span name="us" data-value="7.791552E11"></span>
<span name="nv" data-value="1868842"></span>
<span name="ur" data-value="-1.7834891602345326"></span>
<div class="unseeable">NOT YET RELEASED</div>
</td>
<td class="titleColumn">
1.
<a href="/title/tt0111161" title="Frank Darabont (dir.), Tim Robbins, Morgan Freeman" >The Shawshank Redemption</a>
<span class="secondaryInfo">(1994)</span>
</td>
<td class="ratingColumn imdbRating">
<span title="9.2 based on 1,868,842 user ratings">9.2</span>
</td>
<td class="ratingColumn">
<div class="seen-widget seen-widget-tt0111161 pending" data-titleid="tt0111161">
<div class="inline">
<div class="pending">3.2</div>
<div class="unseeable">NOT YET RELEASED</div>
<div class="unseen">4.5</div>
<div class="rating"></div>
<div class="seen">Seen</div>
</div>
</div>
</td>
<td class="watchlistColumn">
<div class="wlb_ribbon" data-tconst="tt0111161" data-recordmetrics="true"></div>
</td>
</tr>
'''

from lxml import etree

# Parse a local file
#tree = etree.parse("filename")

# Parse an HTML string fetched over the network
#tree = etree.HTML("page source as a string")

imdb = etree.HTML(html)

# Locate by attribute
#print(imdb.xpath('//span[@name="ir"]'))
#print(imdb.xpath('//div[@data-tconst]'))

# Mix hierarchy and index
#print(imdb.xpath('//div[@class="seen-widget seen-widget-tt0111161 pending"]/div/div[1]'))  # indexing starts at 1
#print(imdb.xpath('//div[@class="seen-widget seen-widget-tt0111161 pending"]/div/div[@class="unseeable"]'))  # locating by attribute also works here

#print(imdb.xpath('//td[@class="ratingColumn"]//div'))                 # every div under <td class="ratingColumn">
#print(imdb.xpath('//td[@class="ratingColumn"]//div[@class="seen"]'))  # the trailing step can also filter by attribute

#result1 = imdb.xpath('//div[@class="inline"]/div[last()-2]')


# Logical operators
#print(imdb.xpath('//div[@class="wlb_ribbon" and @data-tconst="tt0111161"]'))  # if one attribute is not enough, combine several with "and"

# Fuzzy matching
#print(imdb.xpath('//div[contains(@class,"un")]'))    # all divs whose class attribute contains "un"
#print(imdb.xpath('//div[contains(text(),4)]'))       # all nodes whose text contains 4
#print(imdb.xpath('//div[starts-with(@class,"r")]'))  # all divs whose class attribute starts with "r"

# Extract text
#print(imdb.xpath('//div[@class="inline"]/div[5]/text()'))  # the node's text content

# Extract an attribute
#print(imdb.xpath('//div[@class="inline"]/div[2]/@class'))

#print(imdb.xpath('//div[@class="inline"]//text()'))  # all tag-free text anywhere under <div class="inline">

#print(imdb.xpath('//div[@class="inline"]/div[last()-1]/@class'))  # another way to reach the same attribute

# Pull out the leading "1."
#s = imdb.xpath('//td[@class="titleColumn"]/text()')
#a = []
#for i in s:
#    if i.strip() != "":
#        a.append(i.strip())

#s = imdb.xpath('//td[@class="titleColumn"]')
#k = s[0].xpath('string(.)')
#l = k.replace('\n', '').replace('\t', '')
#print(l.strip().split()[0])

#for i in result:
#    print(etree.tostring(i))
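
Run directly after the listing above (imdb is the tree built from the html string), two of the commented queries give concrete values, which may help when checking your own expressions:

# uncommented versions of two of the queries above
print(imdb.xpath('//td[@class="titleColumn"]/a/text()')[0])                  # The Shawshank Redemption
print(imdb.xpath('//td[@class="ratingColumn imdbRating"]/span/text()')[0])   # 9.2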

 10、

# -*- coding: utf-8 -*-
"""
Created on Wed May 1 11:13:30 2019

@author: admin
"""

import urllib.request
import urllib.parse


url = 'http://www.baidu.com/'
proxy = {
    'http': '222.135.92.68:38094'
}

# Create the handler
handler = urllib.request.ProxyHandler(proxy)
# Create the opener
opener = urllib.request.build_opener(handler)

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/63.0.3239.132 Safari/537.36'
}
req = urllib.request.Request(url, headers=headers)
response = opener.open(req)
print(response.read().decode('utf-8'))
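
Free proxies like the hard-coded one above tend to die quickly. A quick way to check whether the opener really goes out through the proxy is to ask an echo service such as httpbin.org (a sketch of my own, assuming that service is reachable from your network):

import urllib.request

proxy = {'http': '222.135.92.68:38094'}          # the same throw-away proxy as above
opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxy))

try:
    # httpbin echoes back the IP address the request appeared to come from
    with opener.open('http://httpbin.org/ip', timeout=5) as resp:
        print(resp.read().decode('utf-8'))       # should show the proxy's IP, not yours
except Exception as exc:
    print('proxy appears to be dead:', exc)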

 11、

# -*- coding: utf-8 -*-
"""
Created on Sun Jul 15 11:37:22 2018

@author: T0001
"""

from lxml import etree
import numpy as np
import pandas as pd
import urllib.request
import random
url = 'http://news.ceic.ac.cn/'
agentlist = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
]
agentStr = random.choice(agentlist)

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Type': 'application/x-www-form-urlencoded'
}
headers['User-Agent'] = agentStr
req = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(req).read().decode('utf-8')

earth = etree.HTML(response)
result = earth.xpath('//td[@align="center"]/text()')
result1 = earth.xpath('//td[@align="left"]/a/text()')

# Five centre-aligned cells per row, plus the location text from the left-aligned cell
data = np.array(result).reshape((-1, 5))

c = np.column_stack((data, result1))

# The column names below are the author's arbitrary placeholders
pd.DataFrame(c, columns=['gf', 'gdf', 'dsf', 'dsgf', 'fdg', 'dfgh']).to_csv('dz.csv', index=False)

 

 12、

# -*- coding: utf-8 -*-
"""
Created on Thu Apr 25 19:06:01 2019

@author: Office
"""
import urllib.request
import ssl
import random
import json
import pandas as pd
from sqlalchemy import create_engine
# Pool of User-Agent strings to choose from
agentlist = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
]
# Pick one at random
agentStr = random.choice(agentlist)

def ajaxCrawler(url):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'User-Agent': agentStr,
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Type': 'application/x-www-form-urlencoded'
    }
    req = urllib.request.Request(url, headers=headers)
    # Use ssl to create an unverified context
    context = ssl._create_unverified_context()
    response = urllib.request.urlopen(req, context=context)

    jsonStr = response.read().decode('utf-8')
    jsonData = json.loads(jsonStr)
    return jsonData

title = []
score = []
release_date = []
vote_count = []
for i in range(1, 100):   # note: starting at i=1 means the first 20 titles (start=0) are skipped
    url = 'https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=' + str(i*20) + '&limit=20'
    info = ajaxCrawler(url)
    for j in info:
        title.append(j["title"])
        score.append(j['score'])
        release_date.append(j['release_date'])
        vote_count.append(j['vote_count'])

# Build a DataFrame
data = pd.DataFrame({'score': score, 'title': title, 'release_date': release_date, 'vote_count': vote_count},
                    columns=['score', 'title', 'release_date', 'vote_count'])
# Save to CSV
#data.to_csv('dy.csv')

# Save to MySQL
engine = create_engine('mysql+pymysql://root:123456@localhost/demo')
data.to_sql('douban', engine, if_exists="replace")

 13、

# -*- coding: utf-8 -*-
"""
Created on Thu Jun 13 20:12:39 2019

@author: wqq
"""

import urllib.request
import re
import random
import gzip
import numpy as np
import pandas as pd
url="http://esf.hz.fang.com/"
agentlist=[
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
]
agentStr=random.choice(agentlist)

headers={
\'Accept\':\'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8\',
\'X-REQUESTED-With\':\'XMLHttpRequest\',
\'Content-Type\':\'application/x-www-form-urlencoded\'
}
headers[\'User-Agent\']=agentStr
req=urllib.request.Request(url,headers=headers)

response=urllib.request.urlopen(req)
#print(response.info().get(\'Content-Encoding\'))
string=gzip.decompress(response.read()).decode(\'gbk\')

phone_reg=r\'\'\'<span class="red"><b>(.*?)</b>\'\'\'
phone_pat=re.compile(phone_reg)
z_jia=re.findall(phone_pat,string)

phone_reg=r\'\'\'<span>(.*?)元/㎡</span>\'\'\'
phone_pat=re.compile(phone_reg)
d_jia=re.findall(phone_pat,string)


phone_reg=r\'\'\'<p class="tel_shop">(.*?)<span class="people_name">\'\'\'
phone_pat=re.compile(phone_reg,re.S)
match=re.findall(phone_pat,string)

g_ju=[]
m_ji=[]
l_ceng=[]
c_xiang=[]
n_dai=[]
for i in match:
k=(i.split())
g_ju.append(k[0])
m_ji.append(k[1].split("<i>|</i>")[1])
if "<i>|</i>" not in k[2]:
l_ceng.append(k[2])
else:
l_ceng.append(k[2].split("<i>|</i>")[1])

if "<i>|</i>" not in k[3]:
c_xiang.append(k[3])
else:
c_xiang.append(k[3].split("<i>|</i>")[1])

if "<i>|</i>" not in k[4]:
n_dai.append(k[4])
else:
n_dai.append(k[4].split("<i>|</i>")[1])

phone_reg=r\'\'\'<a target="_blank" href="/house-xm\d+/" title=(.*?)>\'\'\'
phone_pat=re.compile(phone_reg)
g_yu_name=re.findall(phone_pat,string)


phone_reg=r\'\'\'<span class="tit_shop">(.*?)</span>\'\'\'
phone_pat=re.compile(phone_reg)
title=re.findall(phone_pat,string)

phone_reg=r\'\'\'<span>(.*?)</span>\'\'\'
phone_pat=re.compile(phone_reg)
d_duan=re.findall(phone_pat,string)[::2]
d_duan.remove(d_duan[-1])

pd.DataFrame({\'title\':title,\'g_ju\':g_ju,
\'m_ji\':m_ji,\'l_ceng\':l_ceng,
\'c_xiang\':c_xiang,\'n_dai\':n_dai,
\'z_jia(万)\':z_jia,\'d_jia(元/m2)\':d_jia,
\'g_yu_name\':g_yu_name,\'d_duan\':d_duan},
columns=[\'title\',\'g_ju\',\'m_ji\',\'l_ceng\',\'c_xiang\',\'n_dai\',\'z_jia(万)\',\'d_jia(元/m2)\',\'g_yu_name\',\'d_duan\']).to_csv("二手房.csv",index=False)

 

 14、

# -*- coding: utf-8 -*-
"""
Created on Mon Apr 29 08:32:04 2019

@author: Office
"""
import urllib.request
import random
import re

def handle_request(url, page=None):
    if page != None:
        url = url + str(page) + ".html"
    agentlist = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]
    # Pick a User-Agent at random
    agentStr = random.choice(agentlist)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'User-Agent': agentStr,
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Type': 'application/x-www-form-urlencoded'
    }
    request = urllib.request.Request(url, headers=headers)
    return request

def get_text(a_href):
    # Build the request object
    request = handle_request(a_href)
    # Send the request and fetch the response
    content = urllib.request.urlopen(request).read().decode('utf-8')
    # Parse the content
    pattern = re.compile(r'<div class="neirong">(.*?)</div>', re.S)
    lt = pattern.findall(content)
    text = lt[0]

    # A regex that strips every img tag out of the content
    pat = re.compile(r'<img .*?>')
    text = pat.sub('', text)
    return text

def parse_content(content):
    # The regex
    pattern = re.compile(r'<h3><a href="(/lizhi/qianming/\d+\.html)">(.*?)</a></h3>')
    # findall returns a list of tuples: the first element of each tuple is what the first
    # group matched, the second element is what the second group matched
    lt = pattern.findall(content)
    # Walk the list
    for href_title in lt:
        # The link to the article
        a_href = 'http://www.yikexun.cn' + href_title[0]
        # The title
        title = href_title[-1]
        # Request a_href and get the article text back
        text = get_text(a_href)
        # Append everything to an html file
        string = '<h1>%s</h1>%s' % (title, text)
        with open('lizhi.html', 'a', encoding='utf8') as f:
            f.write(string)

def main():
    url = 'http://www.yikexun.cn/lizhi/qianming/list_50_'
    start_page = int(input('Enter the first page number: '))
    end_page = int(input('Enter the last page number: '))
    for page in range(start_page, end_page + 1):   # +1 so the last page is included
        # Build the request for this url and page
        request = handle_request(url, page)
        content = urllib.request.urlopen(request).read().decode('utf-8')

        # Parse the content
        parse_content(content)

main()

15、

# -*- coding: utf-8 -*-
"""
Created on Sun Jul 15 14:16:22 2018

@author: T0001
"""
# Crawl images

import urllib.request
from lxml import etree
import random

url="https://www.ivsky.com/tupian/ziranfengguang/"
agentlist=[
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
]
agentStr=random.choice(agentlist)

headers={
\'Accept\':\'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8\',
\'User-Agent\':agentStr,
\'X-REQUESTED-With\':\'XMLHttpRequest\',
\'Content-Type\':\'application/x-www-form-urlencoded\'
}


proxy=[
{\'http\':\'http://61.164.39.66:53281\'} ,
{\'http\':\'http://116.209.57.18:9999\'},
{\'http\':\'http://183.148.133.77:9999\'},
{\'http\':\'http://211.23.149.29:80\'},
{\'http\':\'http://39.137.69.10:8080\'}
]

end_proxy=random.choice(proxy)

proxy_handler=urllib.request.ProxyHandler(end_proxy)

opener=urllib.request.build_opener(proxy_handler)

req=urllib.request.Request(url,headers=headers)

response=opener.open(req)
html=response.read().decode("utf-8")

html=etree.HTML(html)
a=html.xpath(\'//div[@class="il_img"]/a/@href\')

for i in a:
url_new="https://www.ivsky.com"+i
req1=urllib.request.Request(url,headers=headers)
response1=opener.open(req1)
html1=response1.read().decode("utf-8")
html_pic=etree.HTML(html1)
pic=html_pic.xpath(\'//div[@class="il_img"]/a/img/@src\')

for j in pic:
end_url="https:"+j
req2=urllib.request.Request(end_url,headers=headers)
response2=opener.open(req2)
html2=response2.read()
with open(\'pic/\'+j.split(\'/\')[-1],\'wb\') as f:
f.write(html2)

 

 16、

# -*- coding: utf-8 -*-
"""
Created on Wed May 1 17:33:25 2019

@author: admin
"""
import urllib.request
url = 'http://www.baidu.com/'

# Create the handler
handler = urllib.request.HTTPHandler()
# Create the opener
opener = urllib.request.build_opener(handler)

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/63.0.3239.132 Safari/537.36'
}
req = urllib.request.Request(url, headers=headers)
response = opener.open(req)
print(response.read().decode('utf-8'))
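
Building an opener around HTTPHandler explicitly, as above, is mainly useful for debugging: passing debuglevel=1 makes urllib print the outgoing request and the response headers on stdout. A small sketch of that variant:

import urllib.request

# debuglevel=1 prints the HTTP traffic, which is the main practical reason
# for constructing the HTTPHandler yourself
handler = urllib.request.HTTPHandler(debuglevel=1)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com/')
print(response.getcode())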

17、

# -*- coding: utf-8 -*-
"""
Created on Fri Apr 26 08:37:26 2019

@author: Office
"""

import urllib.request
import random
import re

url="https://www.qiushibaike.com/text/page/1/"
proxy=[
{\'http\':\'http://61.164.39.66:53281\'} ,
{\'http\':\'http://116.209.57.18:9999\'},
{\'http\':\'http://183.148.133.77:9999\'},
{\'http\':\'http://211.23.149.29:80\'},
{\'http\':\'http://39.137.69.10:8080\'}
]

end_proxy=random.choice(proxy)

proxy_handler=urllib.request.ProxyHandler(end_proxy)

opener=urllib.request.build_opener(proxy_handler)

agentlist=[
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
]
agentStr=random.choice(agentlist)

headers={
\'Accept\':\'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8\',
\'User-Agent\':agentStr,
\'X-REQUESTED-With\':\'XMLHttpRequest\',
\'Content-Type\':\'application/x-www-form-urlencoded\'
}
req=urllib.request.Request(url,headers=headers)

response=opener.open(req)
html=response.read().decode(\'utf-8\')
print(html)
pat=r\'<div class="author clearfix">(.*?)<span class="stats-vote"><i class="number">\'
re_joke=re.compile(pat,re.S)
divsList=re_joke.findall(html)
dic={}
for i in divsList:
#用户名
re_u=re.compile(r\'<h2>(.*?)</h2>\',re.S)
username=re_u.findall(i)
username=username[0]

#段子
re_d=re.compile(r\'<div class="content">\n<span>(.*?)</span>\',re.S)
duanzi=re_d.findall(i)
duanzi=duanzi[0]
dic[username]=duanzi
print(dic)

 18、

# -*- coding: utf-8 -*-
"""
Created on Wed May 1 08:50:22 2019

@author: admin
"""

import urllib.request
import urllib.parse
import http.cookiejar
import ssl

ssl._create_default_https_context = ssl._create_unverified_context
# Simulate a real browser: after the login POST request, keep the returned cookie in the program
# Create a CookieJar object
cj = http.cookiejar.CookieJar()
# Create a handler from the CookieJar
handler = urllib.request.HTTPCookieProcessor(cj)
# Build an opener from the handler
opener = urllib.request.build_opener(handler)

url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=201943946542'

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/63.0.3239.132 Safari/537.36'
}

fromdata = {
    'email': '17767258153',
    'icode': '',
    'origURL': 'http://www.renren.com/home',
    'domain': 'renren.com',
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': '204b8409cfb80c1d46a7134d150cd281a1808d1c0429eb7334a3fa8f4c6ae327',
    'rkey': 'b8871697112ad27ac3a61f5e85ebf5b4',
    'f': 'http%3A%2F%2Fwww.renren.com%2F970622703',
}

fromdata = urllib.parse.urlencode(fromdata).encode('utf-8')
req = urllib.request.Request(url, headers=headers)
response = opener.open(req, data=fromdata)
#print(response.read().decode('utf-8'))

# The opener now carries the login cookie, so a protected page can be fetched directly
get_url = "http://www.renren.com/970622703/profile"
req = urllib.request.Request(get_url, headers=headers)
response = opener.open(req)
print(response.read().decode('utf-8'))
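
The example above keeps the cookies only in memory, so they are gone when the script exits. A small sketch (my own addition, using the standard MozillaCookieJar class) of how the same cookies could be persisted to disk and loaded back later:

import http.cookiejar
import urllib.request

# Keep cookies across runs by writing them to a Mozilla-format text file
cj = http.cookiejar.MozillaCookieJar('cookies.txt')
handler = urllib.request.HTTPCookieProcessor(cj)
opener = urllib.request.build_opener(handler)

opener.open('http://www.baidu.com/')               # any request that sets cookies
cj.save(ignore_discard=True, ignore_expires=True)  # persist them

# Later, or in another script:
cj2 = http.cookiejar.MozillaCookieJar()
cj2.load('cookies.txt', ignore_discard=True, ignore_expires=True)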

19、

# -*- coding: utf-8 -*-
"""
Created on Sun Apr 28 11:28:33 2019

@author: Office
"""

import urllib.request
from lxml import etree
import random
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

url = 'http://tubiao.17mcp.com/Ssq/index-500.html'
agentlist = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
]
agentStr = random.choice(agentlist)

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent': agentStr,
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Type': 'application/x-www-form-urlencoded'
}

req = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(req).read().decode('utf-8')
html = etree.HTML(response)
data = html.xpath('//tr/td[@style="color:White"]/text()')
qihao = html.xpath('//tr[@style="height: 25px"]/td[1]/text()')

# Seven numbers per draw, plus the draw number from the first cell of each row
da_ta = np.array(data).reshape(-1, 7)
qi_hao = np.array(qihao)

end_data = np.column_stack((qi_hao, da_ta))

finnal_data = pd.DataFrame(end_data, columns=['qihao', 'one', 'two', 'three', 'four', 'five', 'six', 'seven'])

# Save to CSV
finnal_data.to_csv('双色球.csv', index=False)

# Save to MySQL
engine = create_engine('mysql+pymysql://root:123456@localhost/demo')
finnal_data.to_sql('shungseqiu', engine, if_exists="replace")

 

20、

# -*- coding: utf-8 -*-
"""
Created on Fri Apr 26 15:36:41 2019

@author: Office
"""

import urllib.request
import random
import re

keyname="chakra bracelet"
key=urllib.request.quote(keyname)
for i in range(1,2):
try:
print("--------正在爬第"+str(i)+"页------------")
url="https://s.taobao.com/search?q="+key+"&s="+str((i-1)*44)
proxy=[
{\'http\':\'http://61.164.39.66:53281\'} ,
{\'http\':\'http://116.209.57.18:9999\'},
{\'http\':\'http://183.148.133.77:9999\'},
{\'http\':\'http://211.23.149.29:80\'},
{\'http\':\'http://39.137.69.10:8080\'}
]

end_proxy=random.choice(proxy)

proxy_handler=urllib.request.ProxyHandler(end_proxy)

opener=urllib.request.build_opener(proxy_handler)

agentlist=[
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
]
agentStr=random.choice(agentlist)

headers={
\'Accept\':\'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8\',
\'User-Agent\':agentStr,
\'X-REQUESTED-With\':\'XMLHttpRequest\',
\'Content-Type\':\'application/x-www-form-urlencoded\'
}
req=urllib.request.Request(url,headers=headers)

response=opener.open(req)
data=response.read().decode("utf-8","ignore")

pat=\'"pic_url":"//(.*?)"\'
imglist=re.compile(pat).findall(data)
for j in range(0,len(imglist)):
try:
thisimg=imglist[j]
thisimgurl="http://"+thisimg
localfile="D:/"+str(i)+"_"+str(j)+".jpg"
urllib.request.urlretrieve(thisimgurl,filename=localfile)
except Exception as err:
pass
except Exception as err:
pass

 

 21、

# -*- coding: utf-8 -*-
"""
Created on Thu Jun 13 18:12:39 2019

@author: wqq
"""

import urllib.request
import urllib.parse
import ssl
import random
from lxml import etree
import pandas as pd

ssl._create_default_https_context = ssl._create_unverified_context
url = \'https://veromoda.tmall.com/p/rd609297.htm?spm=a1z10.10672-b-s.w5001-17277175636.16.7b822b67cHKn8X&scene=taobao_shop\'

agentlist = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0"
]
agentstr = random.choice(agentlist)

headers = {
    'user-agent': agentstr,
    'Accept': 'image/webp,*/*',
    'Cookie': 'cq=ccp%3D1; cna=OA95FVY8Iw4CAXAKF/liwJ5M; isg=BI2N3dH67G6QcEhAxVcwy0Dzn6nHwsFXFVAU088SySSTxq14l7rRDNtcMJoFHdn0; l=bBNzmI9HqQPbVy7kBOCwquI8aG7OSIOYYuPRwNqXi_5ay1T_qsQOkjo1oe96Vs5RsXTB4mxQgLp9-etks; hng=""; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; x=__ll%3D-1%26_ato%3D0; t=2e15a61bdd752ef76d25e931fbd573ee; lid=%E4%BD%8E%E8%B0%83leoalan; _tb_token_=e1b6ee565fbb5; cookie2=1f4e270456996b258181536824f34637'
}

req = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(req)
data = response.read().decode('gbk')

tree = etree.HTML(data)
imdb = etree.HTML(data)

title = imdb.xpath('//span[@class="user_name"]/text()')
adress = imdb.xpath('//div[@class="user_w990"]//a[@target="_blank"]/@href')
price = imdb.xpath('//span/span[@class="user_pricetit"]/text()')
#oldprice = imdb.xpath('//span/span[@class="user_ft14 user_yj"]/text()')

# Turn the protocol-relative links into absolute https URLs
a = 0
for i in adress:
    i = 'https:' + i
    adress[a] = i
    a += 1

pd.DataFrame({
    '商品名称': title,
    '商品链接': adress,
    '商品价格': price
},
    columns=['商品名称', '商品链接', '商品价格']
).to_excel('D:/天猫商品.xls')

22、

# -*- coding: utf-8 -*-
"""
Created on Sun Apr 28 19:37:12 2019

@author: Office
"""

import urllib.request
import urllib.parse
from lxml import etree
import time
import random
import os

def handle_request(url, page):
    # Page 1 and the following pages follow different URL patterns, so branch on the page number
    if page == 1:
        url = url.format('')
    else:
        url = url.format('_' + str(page))
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Type': 'application/x-www-form-urlencoded'
    }

    user_angent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]

    end_user_angent = random.choice(user_angent_list)
    headers['User-Agent'] = end_user_angent
    request = urllib.request.Request(url, headers=headers)
    return request

# Parse the page content
def parse_content(content):
    tree = etree.HTML(content)
    # The images are lazy-loaded, so the real image URL lives in the src2 attribute
    image_list = tree.xpath('//div[@id="container"]/div/div/a/img/@src2')
    # Walk the list and download each image
    for image_src in image_list:
        download_image(image_src)

def download_image(image_src):
    dirpath = 'xinggan'
    # Create the folder if it does not exist yet
    if not os.path.exists(dirpath):
        os.mkdir(dirpath)
    # Build a file name
    filename = os.path.basename(image_src)
    # Full path of the image
    filepath = os.path.join(dirpath, filename)
    # Send the request and save the image
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Type': 'application/x-www-form-urlencoded'
    }

    user_angent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]

    end_user_angent = random.choice(user_angent_list)
    headers['User-Agent'] = end_user_angent
    request = urllib.request.Request(image_src, headers=headers)
    response = urllib.request.urlopen(request)
    with open(filepath, 'wb') as f:
        f.write(response.read())

def main():
    # The {} is filled in by handle_request: '' for page 1, '_2', '_3', ... afterwards
    url = 'http://sc.chinaz.com/tupian/xingganmeinvtupian{}.html'
    start_page = int(input('Enter the first page number: '))
    end_page = int(input('Enter the last page number: '))
    for page in range(start_page, end_page + 1):
        request = handle_request(url, page)
        content = urllib.request.urlopen(request).read().decode('utf-8')
        parse_content(content)
        time.sleep(2)

if __name__ == '__main__':
    main()

23、

# -*- coding: utf-8 -*-
"""
Created on Sat Jun 30 21:07:14 2018

@author: Chen
"""

import pydotplus
import os
os.environ["PATH"] += os.pathsep + \'C:/Program Files (x86)/Graphviz2.38/bin/\'
import pandas as pd

# Read the csv file into a DataFrame
df = pd.read_csv('./data.csv')
#print(df.head())  # handy while testing

df = df[['weather', 'temperature', 'humidity', 'wind', 'sports']]
df['weather'] = df['weather'].map({'晴': 0, '阴': 1, '雨': 2})
df['temperature'] = df['temperature'].map({'炎热': 0, '适中': 1, '寒冷': 2})
df['wind'] = df['wind'].map({'弱': 0, '强': 1})

# Split into the feature table and the class label
df = df.dropna()
X = df.drop('sports', axis=1)
Y = df['sports']

'''
# Split into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=1)
'''

from sklearn import tree
model = tree.DecisionTreeClassifier()
# Fit on the training set only
#model.fit(X_train, y_train)
# Fit on the full data set
model.fit(X, Y)

'''
# Check the model's accuracy on the test set
y_predict = model.predict(X_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predict)
'''

# Render the tree as a graph
dot_data = tree.export_graphviz(model.tree_, out_file=None,
                                feature_names=X.columns,
                                class_names=['no', 'yes'],
                                filled=True, rounded=True,  # leaves_parallel=True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)

nodes = graph.get_node_list()

for node in nodes:
    if node.get_label():
        values = [int(ii) for ii in node.get_label().split('value = [')[1].split(']')[0].split(',')]
        color = {0: [255, 255, 224], 1: [255, 224, 255], 2: [224, 255, 255]}
        values = color[values.index(max(values))]   # print(values)
        color = '#{:02x}{:02x}{:02x}'.format(values[0], values[1], values[2])   # print(color)
        node.set_fillcolor(color)

graph.write_pdf("tree.pdf")
graph.write_png("tree.png")

#

 

The above is the complete content of Python -- Crawler Basics. Source link: utcz.com/z/387943.html
