Python3 爬取信息中 json.loads 解码报错

Z时代
2024-03-09
分类：IT

背景介绍：爬取到的网站疫情信息本身符合 JSON 格式，但直接调用 json.loads 解析时报错。

1. 报错代码

Traceback (most recent call last):
  File "D:\Users\shishengchen\PycharmProjects\pythonProject\COVID-19-Data\Test1.py", line 38, in <module>
    getTheList("https://ncov.dxy.cn/ncovh5/view/pneumonia")
  File "D:\Users\shishengchen\PycharmProjects\pythonProject\COVID-19-Data\Test1.py", line 26, in getTheList
    worldDataJson=json.loads(worldDatastr)
  File "D:\Users\shishengchen\AppData\Local\Programs\Python\Python310\lib\json\__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "D:\Users\shishengchen\AppData\Local\Programs\Python\Python310\lib\json\decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "D:\Users\shishengchen\AppData\Local\Programs\Python\Python310\lib\json\decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

2. Python代码

import requests
import re
import json
from bs4 import BeautifulSoup
def getOriHtmlText(url,code='utf-8'):
    """Download *url* and return the response body decoded as text.

    Args:
        url: Address of the page to fetch.
        code: Encoding assigned to the response before its text is read.

    Returns:
        The decoded response body on success.  If ``requests`` reports a
        failure (connection problem, timeout, HTTP error status raised by
        ``raise_for_status``), a fixed error-message string is returned
        instead of raising.
    """
    # Browser-like User-Agent header sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    }
    try:
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # Only request failures are handled here; a bare ``except:`` would
        # also swallow KeyboardInterrupt/SystemExit and programming errors.
        # NOTE: this sentinel looks like ordinary text to the caller, who
        # must compare against it explicitly to detect a failed download.
        return "There are some errors when get the original html!"
def getTheList(url):
    """Fetch the page at *url* and save two embedded JSON arrays to disk.

    The page is parsed with BeautifulSoup and the text of its ``<body>`` is
    searched for two marker strings.  For each marker, the text from the
    first ``[{`` to the first ``}catch`` found after the marker is decoded
    with ``json.loads`` and dumped to ``worldData.json`` /
    ``provinceData.json`` in the hard-coded data directory.

    Args:
        url: Address of the page to scrape.

    Raises:
        json.JSONDecodeError: If the sliced text is not valid JSON.

    NOTE: ``str.find`` returns -1 when a marker is missing.  The slices
    below then collapse to an empty string and ``json.loads`` fails with
    "Expecting value: line 1 column 1 (char 0)".
    NOTE(review): recent Beautiful Soup releases leave ``<script>`` contents
    out of ``.text`` with ``html.parser``, so the markers may never be found
    in ``soup.body.text`` -- confirm against the installed bs4 version.
    """
    html=getOriHtmlText(url)
    soup=BeautifulSoup(html,'html.parser')
    htmlBodyText=soup.body.text
    print(soup.body)
    # Fetch country data.
    worldDataText=htmlBodyText[htmlBodyText.find('window.getListByCountryTypeService2true = '):]
    worldDatastr = worldDataText[worldDataText.find('[{'):worldDataText.find('}catch')]
    worldDataJson=json.loads(worldDatastr)
    with open("D:/Users/shishengchen/PycharmProjects/pythonProject/COVID-19-Data/data/worldData.json","w") as f:
        json.dump(worldDataJson,f)
        print("写入国家数据文件成功！")
    # Fetch per-province data.
    provinceDataText = htmlBodyText[htmlBodyText.find('window.fetchRecentStatV2  = '):]
    provinceDatastr = provinceDataText[provinceDataText.find('[{'):provinceDataText.find('}catch')]
    provinceDataJson=json.loads(provinceDatastr)
    with open("D:/Users/shishengchen/PycharmProjects/pythonProject/COVID-19-Data/data/provinceData.json", "w") as f:
        json.dump(provinceDataJson,f)
        print("写入省份数据文件成功！")


# Script entry: run the scraper against the live page.
getTheList("https://ncov.dxy.cn/ncovh5/view/pneumonia")

回答：

按错误提示，第26行

worldDataJson=json.loads(worldDatastr)

执行出错。我打印了一下，发现上一行得到的 worldDatastr 是空字符串：find 找不到标记时返回 -1，切片结果为空，json.loads 没有数据可以解析，于是报 "Expecting value" 错误。

你的代码只是做字符串截取，没有必要用 BeautifulSoup（soup.body.text 可能不包含 <script> 标签里的内容，所以找不到标记）。直接令 htmlBodyText = html，在获取到的 html 原文上截取就可以了。

import requests
import json
from bs4 import BeautifulSoup
def getOriHtmlText(url,code='utf-8'):
    """Download *url* and return the response body decoded as text.

    Args:
        url: Address of the page to fetch.
        code: Encoding assigned to the response before its text is read.

    Returns:
        The decoded response body on success.  If ``requests`` reports a
        failure (connection problem, timeout, HTTP error status raised by
        ``raise_for_status``), a fixed error-message string is returned
        instead of raising.
    """
    # Browser-like User-Agent header sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    }
    try:
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # Only request failures are handled here; a bare ``except:`` would
        # also swallow KeyboardInterrupt/SystemExit and programming errors.
        # NOTE: this sentinel looks like ordinary text to the caller, who
        # must compare against it explicitly to detect a failed download.
        return "There are some errors when get the original html!"
def _extractJsonArray(pageText, marker):
    """Decode the JSON array embedded in *pageText* after *marker*.

    The array is taken to run from the first ``[{`` to the first ``}catch``
    that follow *marker*.

    Raises:
        json.JSONDecodeError: If *marker* or either delimiter is missing, if
            the delimiters are out of order, or if the sliced text is not
            valid JSON.
    """
    markerAt = pageText.find(marker)
    if markerAt == -1:
        # Without this check a -1 index would silently slice from the last
        # character and hand json.loads an empty string.
        raise json.JSONDecodeError(
            "marker %r not found in page" % marker, pageText, 0)
    start = pageText.find('[{', markerAt)
    end = pageText.find('}catch', markerAt)
    if start == -1 or end == -1 or end < start:
        raise json.JSONDecodeError(
            "no '[{' ... '}catch' span after marker %r" % marker,
            pageText, markerAt)
    return json.loads(pageText[start:end])


def getTheList(url):
    """Fetch the page at *url* and save two embedded JSON arrays to disk.

    The raw page source is searched directly (no HTML parsing) for the
    country-level and province-level marker strings; each array found is
    written to ``worldData.json`` / ``provinceData.json`` in the current
    working directory, and a success message is printed after each write.

    Args:
        url: Address of the page to scrape.

    Raises:
        json.JSONDecodeError: If a marker cannot be located or the text
            after it is not valid JSON.
    """
    html = getOriHtmlText(url)

    # Fetch country data.
    worldDataJson = _extractJsonArray(
        html, 'window.getListByCountryTypeService2true = ')
    with open("worldData.json", "w", encoding="utf-8") as f:
        json.dump(worldDataJson, f)
        print("write country data success")

    # Fetch per-province data.
    provinceDataJson = _extractJsonArray(html, 'window.fetchRecentStatV2 = ')
    with open("provinceData.json", "w", encoding="utf-8") as f:
        json.dump(provinceDataJson, f)
        print("write province data success")


# Script entry: run the scraper against the live page.
getTheList("https://ncov.dxy.cn/ncovh5/view/pneumonia")