Python Crawler Notes (6) Web Crawlers in Practice (2): A Focused Crawler for Stock Data
1. A Focused Crawler for Stock Data
Two sites are involved: Baidu Gupiao serves each individual stock's detail page, and East Money provides the list of all stock codes.
https://gupiao.baidu.com/stock
http://quote.eastmoney.com/stock_list.html
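A minimal sketch of how the two fit together: each code scraped from the list page (sh/sz plus six digits) gets appended to the detail-page base URL. The code sz000001 below is only an assumed example.

code = "sz000001"  # assumed example: exchange prefix + 6 digits
base = "https://gupiao.baidu.com/stock/"
print(base + code + ".html")  # https://gupiao.baidu.com/stock/sz000001.html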
2. Writing the Crawler
2.1 Fetching an HTML Page
def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        print("url:", r.request.url)
        return r.text
    except:
        return ""
2.2 Getting the Stock List (bs4 + regex)
def getStockList(lst, stockURL):
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    # Links to individual stocks live in <a> tags
    a = soup.find_all('a')
    for i in a:
        try:
            # The link sits in the href attribute of the <a> tag;
            # we want sh/sz plus 6 digits, extracted with a regular expression
            href = i.attrs['href']
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except:
            continue
    print(lst)
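To make the regex concrete, here is a standalone check against a hypothetical href in the same format as the links on the list page:

import re

href = "http://quote.eastmoney.com/sz000001.html"  # assumed sample link
print(re.findall(r"[s][hz]\d{6}", href))           # ['sz000001']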
2.3 Fetching Each Stock's Detail Page
def getStockInfo(lst, stockURL, fpath):
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            # Record each stock's fields as key-value pairs
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
            # Extract the stock name
            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            # Add it to the dictionary; split() breaks on whitespace
            infoDict.update({'股票名称': name.text.split()[0]})
            # The remaining fields sit in <dt>/<dd> tag pairs
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for i in range(len(keyList)):
                key = keyList[i].text
                val = valueList[i].text
                infoDict[key] = val
            # Append the collected record to the output file
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        # Use traceback to report any exception and keep going
        except:
            traceback.print_exc()
            continue
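A minimal sketch of the <dt>/<dd> pairing logic on a made-up fragment: the class names are the ones the crawler targets, but the field names and values are invented for illustration.

from bs4 import BeautifulSoup

# Hypothetical HTML in the same shape the crawler expects
html = """<div class="stock-bets">
  <span class="bets-name">平安银行 (000001)</span>
  <dl><dt>今开</dt><dd>10.00</dd></dl>
  <dl><dt>成交量</dt><dd>1.2万手</dd></dl>
</div>"""
soup = BeautifulSoup(html, 'html.parser')
info = soup.find('div', attrs={'class': 'stock-bets'})
record = {'股票名称': info.find_all(attrs={'class': 'bets-name'})[0].text.split()[0]}
for dt, dd in zip(info.find_all('dt'), info.find_all('dd')):
    record[dt.text] = dd.text
print(record)  # {'股票名称': '平安银行', '今开': '10.00', '成交量': '1.2万手'}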
3. Complete Code
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 1 00:40:47 2020

@author: douzi
"""
import requests
from bs4 import BeautifulSoup
# The traceback module is used to report exception details
import traceback
import re
def getHTMLText(url, code='utf-8'):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # r.encoding = r.apparent_encoding  # a focused crawler can hard-code the encoding instead
        r.encoding = code
        print("url:", r.request.url)
        return r.text
    except:
        return ""
def getStockList(lst, stockURL):
    html = getHTMLText(stockURL, "GB2312")
    soup = BeautifulSoup(html, 'html.parser')
    # Links to individual stocks live in <a> tags
    a = soup.find_all('a')
    for i in a:
        try:
            # The link sits in the href attribute of the <a> tag;
            # we want sh/sz plus 6 digits, extracted with a regular expression
            href = i.attrs['href']
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except:
            continue
    print(lst)
def getStockInfo(lst, stockURL, fpath):
    count = 0
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            # Record each stock's fields as key-value pairs
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
            # Extract the stock name
            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            # Add it to the dictionary; split() breaks on whitespace
            infoDict.update({'股票名称': name.text.split()[0]})
            # The remaining fields sit in <dt>/<dd> tag pairs
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for i in range(len(keyList)):
                key = keyList[i].text
                val = valueList[i].text
                infoDict[key] = val
            # Append the collected record to the output file
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
            count = count + 1
            # '\r' returns the cursor to the start of the line, so the
            # progress figure overwrites itself instead of scrolling
            print('\rProgress: {:.2f}%'.format(count * 100 / len(lst)), end='')
        # Use traceback to report any exception and keep going
        except:
            count = count + 1
            print('\rProgress: {:.2f}%'.format(count * 100 / len(lst)), end='')
            traceback.print_exc()
            continue
def main():
    # Stock list page
    stock_list_url = "http://quote.eastmoney.com/stock_list.html"
    # Per-stock detail pages
    stock_info_url = "https://gupiao.baidu.com/stock/"
    output_file = ".//Result_stock.txt"
    slist = []
    # Get the list of stock codes
    getStockList(slist, stock_list_url)
    # Crawl each stock's detail page
    getStockInfo(slist, stock_info_url, output_file)

if __name__ == "__main__":
    main()
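Two design points worth noting: the East Money list page is GB2312-encoded, so getStockList passes that encoding explicitly rather than paying for apparent_encoding on every request, and each record is appended to Result_stock.txt as one dictionary per line. Before a full run, one might slice the list for a quick smoke test (the sample size of 5 below is an arbitrary choice):

slist = []
getStockList(slist, "http://quote.eastmoney.com/stock_list.html")
# Trial run over the first 5 codes only (hypothetical sample size)
getStockInfo(slist[:5], "https://gupiao.baidu.com/stock/", "Result_stock_sample.txt")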