[Python]爬取新型冠状病毒2.2至今的所有数据 python 2.13
爬取网址http://hu.yixue99.com/2020/kszx_0205/27792.html
代码如下:
import requests
from bs4 import BeautifulSoup

# Overview page that lists the epidemic summary table and the per-day links.
url = "http://hu.yixue99.com/2020/kszx_0205/27792.html"
# Minimal request headers; a browser-like User-Agent avoids bot rejection.
kv = {'user-agent': 'Mozilla/5.0'}
# Scrape the overview table (time / confirmed / suspected / dead / cured,
# 5 fields per record) and append one space-separated line per record.
def content():
    url = "http://hu.yixue99.com/2020/kszx_0205/27792.html"
    r = requests.get(url, headers=kv)
    # Use the detected encoding so the Chinese page text decodes correctly.
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")
    print("开始")
    # Column-header labels that must be stripped from each cell's text.
    labels = ("时间(北京时间)", "确诊", "疑似", "死亡", "治愈", "疫情详情", "点击查看")
    num = 0
    texts = ""
    for s in soup.find_all("span", {"style": "font-size:14px;"}):
        # BUG FIX: s.string is None for spans with nested markup, and
        # str(None) produced the literal "None", which passed the
        # `text != ""` filter and corrupted the output records.
        text = str(s.string or "")
        for label in labels:
            text = text.replace(label, "")
        if text != "":
            num += 1
            if num % 5 != 0:
                # Fields 1-4 of a record: accumulate with a separator.
                texts += text + " "
            else:
                # Fifth field completes the record: persist it, then reset.
                texts += text
                print(texts)
                wtire_content(texts.replace("例", "") + "\n")
                texts = ""
30
# Scrape the overview page and append every per-day detail link
# (one URL per line) to the link file.
def href():
    url = "http://hu.yixue99.com/2020/kszx_0205/27792.html"
    r = requests.get(url, headers=kv)
    # Use the detected encoding so the Chinese page text decodes correctly.
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")
    print("开始")
    for s in soup.find_all("span", {"style": "font-size:14px;"}):
        # Call find("a") once and reuse it (the original searched twice
        # per span); also drop the unused num/texts locals and avoid
        # shadowing the function's own name with a local `href`.
        a = s.find("a")
        if a is not None:
            link = str(a.attrs["href"])
            print(link)
            wtire_href(link + "\n")
48
# Scrape one daily detail page: the first matching <td> is the date banner,
# the rest are province rows of 5 fields each; append
# "<date> <province> <confirmed> <suspected> <dead> <cured>" lines.
def content_day(url):
    # The inline style shared by every data cell on the page, hoisted so
    # the banner lookup and the row loop cannot drift apart (the original
    # duplicated this long literal in find() and find_all()).
    cell_style = "PADDING-BOTTOM: 0px; PADDING-TOP: 0px; PADDING-LEFT: 0px; MARGIN: 0px; PADDING-RIGHT: 0px"
    r = requests.get(url, headers=kv)
    # Use the detected encoding so the Chinese page text decodes correctly.
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")
    print(url)
    print("开始")
    cells = soup.find_all("td", {"style": cell_style})
    # Strip the banner wrapper text to keep only the date; the chain keeps
    # both punctuation variants exactly as the site emits them.
    day = str(cells[0].string).replace("各省疫情动态(截止至","").replace(" 10:00)","").replace("各省疫情动态(截止至","").replace(" 11:00)","")
    print(day)
    num = 0
    texts = ""
    # Slice past the banner cell instead of the original `one` flag.
    for s in cells[1:]:
        # BUG FIX: str(s.string) turned None (nested markup) into the
        # literal "None", which passed the `text != ""` filter and
        # corrupted the 5-field grouping.
        text = str(s.string or "").replace("确诊", "").replace("疑似", "").replace("死亡", "").replace("治愈", "").replace(" ", "").replace("省份", "")
        if text != "":
            num += 1
            if num % 5 != 0:
                # Fields 1-4 of a record: accumulate with a separator.
                texts += text + " "
            else:
                # Fifth field completes the record: persist it, then reset.
                texts += text
                print(day + texts)
                write_content_day(day + " " + texts + "\n")
                texts = ""
# Append one overview record to the summary file.
# NOTE(review): "wtire" is a typo for "write", but the name is kept
# unchanged because content() calls it by this name.
def wtire_content(contents):
    # Context manager guarantees the handle is closed even if write raises
    # (the original manual open/close leaked the handle on error).
    with open("E:/bingducsv/bingdusum.txt", "a+", encoding="utf-8") as f:
        f.write(contents)
# Append one per-day detail URL to the link file.
# NOTE(review): "wtire" is a typo for "write", but the name is kept
# unchanged because href() calls it by this name.
def wtire_href(contents):
    # Context manager guarantees the handle is closed even if write raises
    # (the original manual open/close leaked the handle on error).
    with open("E:/bingducsv/bingduhref.txt", "a+", encoding="utf-8") as f:
        f.write(contents)
# Read the saved per-day links (one URL per line) and scrape each page.
def read():
    # BUG FIX: the original opened the file and never closed it; `with`
    # closes it deterministically. Plain "r" replaces "r+" since the
    # function never writes to the file.
    with open("E:/bingducsv/bingduhref.txt", "r", encoding="utf-8") as f:
        for line in f:
            content_day(line.rstrip("\n"))
# Append one per-day province record to the daily-detail file.
def write_content_day(contents):
    # Context manager guarantees the handle is closed even if write raises
    # (the original manual open/close leaked the handle on error).
    with open("E:/bingducsv/bingduday.txt", "a+", encoding="utf-8") as f:
        f.write(contents)
101
# Entry point: scrape the overview summary, save the per-day links,
# then visit every saved link for per-province detail.
if __name__ == "__main__":
    content()
    href()
    read()
以上是 [Python]爬取新型冠状病毒2.2至今的所有数据 python .2.13 的全部内容, 来源链接: utcz.com/z/389408.html