Python3 爬取信息中 json.loads 解码报错

Python3 爬取信息中 json.loads 解码报错

背景介绍:爬取的网站疫情信息本身符合 JSON 格式,但直接调用 json.loads 解析时报错。

1. 报错信息

Traceback (most recent call last):

File "D:\Users\shishengchen\PycharmProjects\pythonProject\COVID-19-Data\Test1.py", line 38, in <module>

getTheList("https://ncov.dxy.cn/ncovh5/view/pneumonia")

File "D:\Users\shishengchen\PycharmProjects\pythonProject\COVID-19-Data\Test1.py", line 26, in getTheList

worldDataJson=json.loads(worldDatastr)

File "D:\Users\shishengchen\AppData\Local\Programs\Python\Python310\lib\json\__init__.py", line 346, in loads

return _default_decoder.decode(s)

File "D:\Users\shishengchen\AppData\Local\Programs\Python\Python310\lib\json\decoder.py", line 337, in decode

obj, end = self.raw_decode(s, idx=_w(s, 0).end())

File "D:\Users\shishengchen\AppData\Local\Programs\Python\Python310\lib\json\decoder.py", line 355, in raw_decode

raise JSONDecodeError("Expecting value", s, err.value) from None

json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

2. Python代码

import requests

import re

import json

from bs4 import BeautifulSoup

def getOriHtmlText(url,code='utf-8'):
    """Download `url` and return the response body decoded as `code`.

    Args:
        url: Address of the page to fetch.
        code: Encoding used to decode the response body.

    Returns:
        The decoded page text, or the fixed message
        "There are some errors when get the original html!" when the
        request fails or the server answers with an HTTP error status.
        Callers therefore receive a string in both cases and must be
        prepared for it not to be HTML.
    """
    # A browser-like User-Agent is sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    }
    try:
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # Only request-level failures (connection errors, timeouts, HTTP
        # error statuses, invalid URLs) are turned into the fallback string;
        # KeyboardInterrupt, SystemExit and programming errors propagate.
        return "There are some errors when get the original html!"

# Download the page at `url`, cut two embedded JSON arrays out of its body
# text and save each one to its own .json file.
# NOTE(review): indentation was lost when this snippet was pasted; per the
# traceback the statements below run inside getTheList — confirm the nesting
# of the two `with` bodies against the original file.
def getTheList(url):

html=getOriHtmlText(url)

soup=BeautifulSoup(html,'html.parser')

# Only the text of <body> is searched below, not the raw HTML.
htmlBodyText=soup.body.text

print(soup.body)

# Fetch the per-country data

# Keep everything from the marker to the end of the text.
worldDataText=htmlBodyText[htmlBodyText.find('window.getListByCountryTypeService2true = '):]

# Slice from the first '[{' up to (not including) '}catch'.
worldDatastr = worldDataText[worldDataText.find('[{'):worldDataText.find('}catch')]

# NOTE(review): this is the statement shown in the traceback raising
# JSONDecodeError "Expecting value: line 1 column 1 (char 0)"; presumably
# worldDatastr is empty because the find() calls above returned -1 — confirm.
worldDataJson=json.loads(worldDatastr)

with open("D:/Users/shishengchen/PycharmProjects/pythonProject/COVID-19-Data/data/worldData.json","w") as f:

json.dump(worldDataJson,f)

print("写入国家数据文件成功!")

# Fetch the per-province data

# Same extraction as above, with a different marker.
provinceDataText = htmlBodyText[htmlBodyText.find('window.fetchRecentStatV2 = '):]

provinceDatastr = provinceDataText[provinceDataText.find('[{'):provinceDataText.find('}catch')]

provinceDataJson=json.loads(provinceDatastr)

with open("D:/Users/shishengchen/PycharmProjects/pythonProject/COVID-19-Data/data/provinceData.json", "w") as f:

json.dump(provinceDataJson,f)

print("写入省份数据文件成功!")

# Script entry point: runs getTheList on the ncov.dxy.cn pneumonia page as soon as the file is executed.
getTheList("https://ncov.dxy.cn/ncovh5/view/pneumonia")


回答:

按错误提示,第26行

worldDataJson=json.loads(worldDatastr)

执行出错。我打印了一下,发现上一行没有成功获取 worldDatastr,它的值为空字符串;json.loads 没有任何数据可以解析,因此报错。


看了你的代码,其中只做了字符串截取,没有必要使用 BeautifulSoup。直接令 htmlBodyText = html,也就是在请求返回的原始 html 文本上截取就可以了。

import requests

import json

from bs4 import BeautifulSoup

def getOriHtmlText(url,code='utf-8'):
    """Download `url` and return the response body decoded as `code`.

    Args:
        url: Address of the page to fetch.
        code: Encoding used to decode the response body.

    Returns:
        The decoded page text, or the fixed message
        "There are some errors when get the original html!" when the
        request fails or the server answers with an HTTP error status.
        Callers therefore receive a string in both cases and must be
        prepared for it not to be HTML.
    """
    # A browser-like User-Agent is sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    }
    try:
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # Only request-level failures (connection errors, timeouts, HTTP
        # error statuses, invalid URLs) are turned into the fallback string;
        # KeyboardInterrupt, SystemExit and programming errors propagate.
        return "There are some errors when get the original html!"

def _extract_json_array(text, marker):
    """Return the JSON array literal that follows `marker` in `text`.

    The array is taken from the first '[{' after the marker up to (not
    including) the next '}catch'.

    Raises:
        json.JSONDecodeError: if the marker or either delimiter is missing.
            The same exception type json.loads would raise on the resulting
            empty string is used, but with a message naming what was not
            found.
    """
    marker_pos = text.find(marker)
    if marker_pos == -1:
        raise json.JSONDecodeError(
            f"marker {marker!r} not found in page text", text, 0)
    start = text.find('[{', marker_pos)
    if start == -1:
        raise json.JSONDecodeError(
            f"no '[{{' found after marker {marker!r}", text, marker_pos)
    end = text.find('}catch', start)
    if end == -1:
        raise json.JSONDecodeError(
            f"no '}}catch' found after marker {marker!r}", text, start)
    return text[start:end]


def getTheList(url):
    """Download `url`, extract the embedded country and province JSON arrays
    and write them to worldData.json and provinceData.json.

    The raw HTML returned by getOriHtmlText is searched directly; no HTML
    parsing is needed because the data is located purely by string markers.

    Args:
        url: Address of the page that embeds the data.

    Raises:
        json.JSONDecodeError: if a marker is missing from the page or the
            extracted text is not valid JSON.
    """
    html = getOriHtmlText(url)

    # (marker preceding the data, output file, message printed on success)
    targets = (
        ('window.getListByCountryTypeService2true = ',
         "worldData.json", "write country data success"),
        ('window.fetchRecentStatV2 = ',
         "provinceData.json", "write province data success"),
    )
    for marker, path, message in targets:
        data = json.loads(_extract_json_array(html, marker))
        with open(path, "w") as f:
            json.dump(data, f)
        print(message)

# Script entry point: runs getTheList on the ncov.dxy.cn pneumonia page as soon as the file is executed.
getTheList("https://ncov.dxy.cn/ncovh5/view/pneumonia")

以上是 Python3 爬取信息中 json.loads 解码报错 的全部内容, 来源链接: utcz.com/p/938174.html

回到顶部