Python Crawler Notes (6): Web Crawlers in Practice (2) - Targeted Stock Data Crawler


1. Targeted Stock Data Crawler

The goal is to collect basic information for Shanghai (sh) and Shenzhen (sz) listed stocks. Two pages are used:

Per-stock details (Baidu Gupiao): https://gupiao.baidu.com/stock

Stock list (East Money): http://quote.eastmoney.com/stock_list.html

2. Writing the Example

2.1 Fetching the HTML Page

def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        print("url:", r.request.url)
        return r.text
    except:
        return ""

2.2 Getting the Stock List (bs4 + Regular Expressions)

def getStockList(lst, stockURL):
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    # the links to individual stocks sit in <a> tags
    a = soup.find_all('a')
    for i in a:
        try:
            # the stock code is in the href attribute of each <a> tag;
            # we want sh/sz followed by 6 digits, extracted with a regular expression
            href = i.attrs['href']
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except:
            # tags without an href, or hrefs without a stock code, are skipped
            continue
    print(lst)
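To see what the regular expression matches, here is a minimal sketch run against hand-written hrefs (the sample URLs are illustrative, not scraped):

import re

samples = [
    "http://quote.eastmoney.com/sh600000.html",  # Shanghai: sh + 6 digits
    "http://quote.eastmoney.com/sz000001.html",  # Shenzhen: sz + 6 digits
    "http://quote.eastmoney.com/center.html",    # no stock code -> no match
]
for href in samples:
    print(re.findall(r"[s][hz]\d{6}", href))
# ['sh600000']
# ['sz000001']
# []

Note that [s][hz] is just a verbose spelling of s[hz].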

2.3 Getting the Stock Details

def getStockInfo(lst, stockURL, fpath):
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            # record the stock's fields as key-value pairs
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
            # get the stock name
            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            # add it to the dictionary; split() splits on whitespace
            infoDict.update({'股票名称': name.text.split()[0]})
            # the remaining fields come in <dt>/<dd> tag pairs
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for i in range(len(keyList)):
                key = keyList[i].text
                val = valueList[i].text
                infoDict[key] = val
            # append the collected record to the output file
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        except:
            # use traceback to report the exception, then move on
            traceback.print_exc()
            continue
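The index loop pairs the i-th <dt> (field name) with the i-th <dd> (field value). The same pairing can be written with zip, which also stops safely at the shorter list if the counts ever differ. A minimal sketch on a hand-written fragment (the field names are illustrative):

from bs4 import BeautifulSoup

fragment = """
<div class="stock-bets">
  <dt>今开</dt><dd>10.00</dd>
  <dt>成交量</dt><dd>100000</dd>
</div>
"""
info = BeautifulSoup(fragment, 'html.parser')
pairs = {dt.text: dd.text
         for dt, dd in zip(info.find_all('dt'), info.find_all('dd'))}
print(pairs)  # {'今开': '10.00', '成交量': '100000'}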

3. Complete Code

# -*- coding: utf-8 -*-
"""
Created on Sat Feb 1 00:40:47 2020

@author: douzi
"""

import requests
from bs4 import BeautifulSoup
# the traceback module is used to report exception details
import traceback
import re


def getHTMLText(url, code='utf-8'):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # r.encoding = r.apparent_encoding  # a targeted crawler knows the page
        # encoding in advance, so it can be fixed directly instead of guessed
        r.encoding = code
        print("url:", r.request.url)
        return r.text
    except:
        return ""


def getStockList(lst, stockURL):
    html = getHTMLText(stockURL, "GB2312")
    soup = BeautifulSoup(html, 'html.parser')
    # the links to individual stocks sit in <a> tags
    a = soup.find_all('a')
    for i in a:
        try:
            # the stock code is in the href attribute of each <a> tag;
            # we want sh/sz followed by 6 digits, extracted with a regular expression
            href = i.attrs['href']
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except:
            continue
    print(lst)


def getStockInfo(lst, stockURL, fpath):
    count = 0
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            # record the stock's fields as key-value pairs
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
            # get the stock name
            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            # add it to the dictionary; split() splits on whitespace
            infoDict.update({'股票名称': name.text.split()[0]})
            # the remaining fields come in <dt>/<dd> tag pairs
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for i in range(len(keyList)):
                key = keyList[i].text
                val = valueList[i].text
                infoDict[key] = val
            # append the collected record to the output file
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
            count = count + 1
            # '\r' returns the cursor to the start of the line, so each print
            # overwrites the previous progress percentage in place
            print('\r当前速度: {:.2f}%'.format(count * 100 / len(lst)), end='')
        except:
            count = count + 1
            print('\r当前速度: {:.2f}%'.format(count * 100 / len(lst)), end='')
            # use traceback to report the exception, then move on
            traceback.print_exc()
            continue


def main():
    # source of the stock list
    stock_list_url = "http://quote.eastmoney.com/stock_list.html"
    # source of per-stock details
    stock_info_url = "https://gupiao.baidu.com/stock/"
    output_file = ".//Result_stock.txt"
    slist = []
    # get the stock list
    getStockList(slist, stock_list_url)
    # get the details for each stock
    getStockInfo(slist, stock_info_url, output_file)


if __name__ == "__main__":
    main()
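Each output line is str(infoDict), i.e. a Python dict literal, so the saved file can be read back with ast.literal_eval. A minimal sketch, assuming Result_stock.txt already exists from a run of the crawler:

import ast

with open("Result_stock.txt", encoding="utf-8") as f:
    for line in f:
        record = ast.literal_eval(line.strip())  # safely parse the dict literal
        print(record.get('股票名称'))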

