Python怎么爬取表格?
Python爬取表格的步骤:
1、检查url地址,用requests.get()请求页面,并用raise_for_status()检查请求是否成功;
2、爬取资源,用BeautifulSoup()解析页面内容,并且用find_all('tr')提取表格中的每一行;
3、保存资源,用csv.writer()的writerow()将表格逐行写入指定目录下的csv文件。
import csv

import bs4
import requests
from bs4 import BeautifulSoup
# Check the URL and download the page
def check_link(url, timeout=10):
    """Fetch *url* and return the decoded page text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The response body as text, or ``None`` when the request fails
        (connection error, timeout, or an HTTP error status). A message
        is printed in the failure case.
    """
    try:
        r = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into an exception instead of parsing an
        # error page as if it were the table.
        r.raise_for_status()
    except requests.RequestException:
        print('无法链接服务器!!!')
        return None
    # Decode with the encoding detected from the body rather than the one
    # guessed from the HTTP headers.
    r.encoding = r.apparent_encoding
    return r.text
# Parse the page and collect the table rows
def get_contents(ulist, rurl):
    """Append one list per ``<tr>`` element found in *rurl* to *ulist*.

    Args:
        ulist: List that is extended in place; each appended item is a
            list holding the ``.string`` of every child node of one row.
        rurl: HTML text of the page. ``None`` or an empty string (for
            example after a failed download) leaves *ulist* untouched.

    Returns:
        None. Results are delivered through *ulist*.
    """
    if not rurl:
        # Nothing was downloaded, so there is nothing to parse.
        return
    soup = BeautifulSoup(rurl, 'lxml')
    for tr in soup.find_all('tr'):
        # Every child node of the row is kept, not only the cell tags, so
        # the positions of the values inside each row list stay exactly as
        # the code that indexes these rows expects.
        ulist.append([child.string for child in tr])
# Save the collected rows
def save_contents(urlist, path="D:/2016年中国企业500强排行榜.csv"):
    """Write the title line and three columns of every row to a CSV file.

    Args:
        urlist: Sequence of row lists; the values at positions 1, 3 and 5
            of each row are written as one CSV record.
        path: Destination file. Defaults to the original hard-coded
            location.

    Rows with fewer than six entries are skipped because they do not
    contain the three columns being exported.
    """
    # newline='' lets the csv module control line endings (otherwise every
    # record is followed by a blank line on Windows); utf-8-sig keeps the
    # Chinese text readable when the file is opened in Excel.
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['2016年中国企业500强排行榜'])
        for row in urlist:
            if len(row) < 6:
                continue
            writer.writerow([row[1], row[3], row[5]])
def main():
    """Download the ranking page, extract its table rows and save them."""
    urli = []
    url = "http://www.maigoo.com/news/463071.html"
    rs = check_link(url)
    if rs is None:
        # The download failed and check_link has already reported it;
        # parsing or saving would only produce an error or an empty file.
        return
    get_contents(urli, rs)
    save_contents(urli)


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
运行结果:
以上是 python怎么爬取表格? 的全部内容, 来源链接: utcz.com/z/528058.html