Python3 爬取信息中josn.loads解码报错
背景介绍:爬取网站疫情信息本身符合json格式,直接json.loads报错。
1. 报错代码
Traceback (most recent call last): File "D:\Users\shishengchen\PycharmProjects\pythonProject\COVID-19-Data\Test1.py", line 38, in <module>
getTheList("https://ncov.dxy.cn/ncovh5/view/pneumonia")
File "D:\Users\shishengchen\PycharmProjects\pythonProject\COVID-19-Data\Test1.py", line 26, in getTheList
worldDataJson=json.loads(worldDatastr)
File "D:\Users\shishengchen\AppData\Local\Programs\Python\Python310\lib\json\__init__.py", line 346, in loads
return _default_decoder.decode(s)
File "D:\Users\shishengchen\AppData\Local\Programs\Python\Python310\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "D:\Users\shishengchen\AppData\Local\Programs\Python\Python310\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
2. Python代码
import requestsimport re
import json
from bs4 import BeautifulSoup
def getOriHtmlText(url,code='utf-8'):
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
}
r=requests.get(url,timeout=30,headers=headers)
r.raise_for_status()
r.encoding=code
return r.text
except:
return "There are some errors when get the original html!"
def getTheList(url):
html=getOriHtmlText(url)
soup=BeautifulSoup(html,'html.parser')
htmlBodyText=soup.body.text
print(soup.body)
# 获取国家数据
worldDataText=htmlBodyText[htmlBodyText.find('window.getListByCountryTypeService2true = '):]
worldDatastr = worldDataText[worldDataText.find('[{'):worldDataText.find('}catch')]
worldDataJson=json.loads(worldDatastr)
with open("D:/Users/shishengchen/PycharmProjects/pythonProject/COVID-19-Data/data/worldData.json","w") as f:
json.dump(worldDataJson,f)
print("写入国家数据文件成功!")
# 获取各省份数据
provinceDataText = htmlBodyText[htmlBodyText.find('window.fetchRecentStatV2 = '):]
provinceDatastr = provinceDataText[provinceDataText.find('[{'):provinceDataText.find('}catch')]
provinceDataJson=json.loads(provinceDatastr)
with open("D:/Users/shishengchen/PycharmProjects/pythonProject/COVID-19-Data/data/provinceData.json", "w") as f:
json.dump(provinceDataJson,f)
print("写入省份数据文件成功!")
getTheList("https://ncov.dxy.cn/ncovh5/view/pneumonia")
回答:
按错误提示,第26行
worldDataJson=json.loads(worldDatastr)
执行出错,我打印了一下,上一行 获取 worldDatastr
没有成功,值为空,导致json.loads错误,没有数据可以解析
看了你的代码仅仅是字符串截取,没有必要用BeautifulSoup,你直接用 htmlBodyText = html,使用获取的html结果就可以了。
import requestsimport json
from bs4 import BeautifulSoup
def getOriHtmlText(url,code='utf-8'):
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
}
r=requests.get(url,timeout=30,headers=headers)
r.raise_for_status()
r.encoding=code
return r.text
except:
return "There are some errors when get the original html!"
def getTheList(url):
html=getOriHtmlText(url)
soup=BeautifulSoup(html,'html.parser')
htmlBodyText=soup.body.text
htmlBodyText = html
print(soup.body)
# fetch country data
print(htmlBodyText)
worldDataText=htmlBodyText[htmlBodyText.find('window.getListByCountryTypeService2true = '):]
worldDatastr = worldDataText[worldDataText.find('[{'):worldDataText.find('}catch')]
worldDataJson=json.loads(worldDatastr)
with open("worldData.json","w") as f:
json.dump(worldDataJson,f)
print("write country data success")
# fetch province data
provinceDataText = htmlBodyText[htmlBodyText.find('window.fetchRecentStatV2 = '):]
provinceDatastr = provinceDataText[provinceDataText.find('[{'):provinceDataText.find('}catch')]
provinceDataJson=json.loads(provinceDatastr)
with open("provinceData.json", "w") as f:
json.dump(provinceDataJson,f)
print("write province data success")
getTheList("https://ncov.dxy.cn/ncovh5/view/pneumonia")
以上是 Python3 爬取信息中josn.loads解码报错 的全部内容, 来源链接: utcz.com/p/938174.html