[Python] Scraping all novel coronavirus data from 2.2 to the present (Python, 2.13)


Target page to scrape: http://hu.yixue99.com/2020/kszx_0205/27792.html

The code is as follows. It first scrapes the overview table, then collects each day's detail link, and finally visits every daily page to scrape the per-province figures:

import requests
from bs4 import BeautifulSoup

url = "http://hu.yixue99.com/2020/kszx_0205/27792.html"
kv = {'user-agent': 'Mozilla/5.0'}


# Scrape the overview table (one row per day)
def content():
    url = "http://hu.yixue99.com/2020/kszx_0205/27792.html"
    r = requests.get(url, headers=kv)
    r.encoding = r.apparent_encoding
    demo = r.text
    soup = BeautifulSoup(demo, "html.parser")
    print("开始")
    # print(r.text)
    num = 0
    texts = ""
    for s in soup.find_all("span", {"style": "font-size:14px;"}):
        # Strip the table-header words so only the data cells remain
        text = str(s.string).replace("时间(北京时间)", "").replace("确诊", "").replace("疑似", "").replace("死亡", "").replace("治愈", "").replace("疫情详情", "").replace("点击查看", "")
        if text != "":
            num += 1
            if num % 5 != 0:
                texts += text + " "
            else:
                # Every fifth cell completes one row: print it and save it
                texts += text
                print(texts)
                write_content(texts.replace("例", "") + "\n")
                texts = ""


# Scrape the link to each day's detail page
def href():
    url = "http://hu.yixue99.com/2020/kszx_0205/27792.html"
    r = requests.get(url, headers=kv)
    r.encoding = r.apparent_encoding
    demo = r.text
    soup = BeautifulSoup(demo, "html.parser")
    print("开始")
    # print(r.text)
    for s in soup.find_all("span", {"style": "font-size:14px;"}):
        if s.find("a") is not None:
            href = str(s.find("a").attrs["href"])
            print(href)
            write_href(href + "\n")


# Scrape the per-province figures on one day's detail page
def content_day(url):
    r = requests.get(url, headers=kv)
    r.encoding = r.apparent_encoding
    demo = r.text
    soup = BeautifulSoup(demo, "html.parser")
    print(url)
    print("开始")
    num = 0
    texts = ""
    one = 0
    # The first matching <td> holds the timestamp; strip the surrounding wording
    time = str(soup.find("td", {"style": "PADDING-BOTTOM: 0px; PADDING-TOP: 0px; PADDING-LEFT: 0px; MARGIN: 0px; PADDING-RIGHT: 0px"}).string).replace("各省疫情动态(截止至", "").replace(" 10:00)", "").replace("各省疫情动态(截止至", "").replace(" 11:00)", "")
    print(time)
    for s in soup.find_all("td", {"style": "PADDING-BOTTOM: 0px; PADDING-TOP: 0px; PADDING-LEFT: 0px; MARGIN: 0px; PADDING-RIGHT: 0px"}):
        text = str(s.string).replace("确诊", "").replace("疑似", "").replace("死亡", "").replace("治愈", "").replace("　", "").replace("省份", "")
        if one == 0:
            # Skip the first cell (the timestamp handled above)
            one += 1
        else:
            if text != "":
                num += 1
                if num % 5 != 0:
                    texts += text + " "
                else:
                    texts += text
                    print(time + texts)
                    write_content_day(time + " " + texts + "\n")
                    texts = ""


# Save the overview rows
def write_content(contents):
    f = open("E:/bingducsv/bingdusum.txt", "a+", encoding="utf-8")
    f.write(contents)
    f.close()


# Save the daily detail links
def write_href(contents):
    f = open("E:/bingducsv/bingduhref.txt", "a+", encoding="utf-8")
    f.write(contents)
    f.close()


# Read the saved links back and scrape each day's page
def read():
    f = open("E:/bingducsv/bingduhref.txt", "r", encoding="utf-8")
    for line in f:
        url = line.rstrip("\n")
        content_day(url)
    f.close()


# Save one day's per-province rows
def write_content_day(contents):
    f = open("E:/bingducsv/bingduday.txt", "a+", encoding="utf-8")
    f.write(contents)
    f.close()


if __name__ == "__main__":
    content()
    href()
    read()
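
The three text files can then be post-processed for analysis. Below is a minimal sketch (not part of the original script) that converts bingduday.txt into a CSV, assuming each line is space-separated as "date province confirmed suspected dead cured", which is the format written by write_content_day above; the output file name is illustrative.

import csv

# Post-processing sketch: turn the per-day text file into a CSV.
# Assumption: each line of bingduday.txt is space-separated as
# "date province confirmed suspected dead cured" (as written by
# write_content_day above). The output file name is illustrative.
with open("E:/bingducsv/bingduday.txt", "r", encoding="utf-8") as src, \
     open("E:/bingducsv/bingduday.csv", "w", encoding="utf-8", newline="") as dst:
    writer = csv.writer(dst)
    writer.writerow(["date", "province", "confirmed", "suspected", "dead", "cured"])
    for line in src:
        fields = line.split()
        if len(fields) == 6:  # skip any malformed rows
            writer.writerow(fields)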

That is the full content of [Python] Scraping all novel coronavirus data from 2.2 to the present (Python, 2.13). Source link: utcz.com/z/389408.html
