Python 爬取网页内容
1 #encoding:UTF-82 import urllib
3 import urllib.request
4 import bs4
5 from bs4 import BeautifulSoup as bs
def test1(url="http://www.stylusstudio.com/edifact/D95B/CODECO.htm",
          table_index=7,
          out_path='./text1.txt',
          out_encoding='cp852'):
    """Print every row of one HTML table on a page and append the rows to a file.

    The page at ``url`` is downloaded, decoded as UTF-8 and parsed with
    BeautifulSoup's ``html.parser``. For each ``<tr>`` of the selected
    ``<table>`` three things are printed: the raw markup, the row's plain
    text, and the list of its whitespace-stripped text fragments. The
    ``str()`` of that list is also appended to ``out_path``, one row per line.

    Args:
        url: Address of the page to download.
        table_index: Zero-based position of the table among all ``<table>``
            elements of the page (the default ``7`` is the eighth table).
        out_path: File the rows are appended to; existing content is kept.
        out_encoding: Text encoding used for ``out_path``. A row containing a
            character this encoding cannot represent raises
            ``UnicodeEncodeError``.

    Raises:
        IndexError: If the page has no table at ``table_index``.
    """
    # Read the whole body inside a ``with`` so the connection is always closed.
    with urllib.request.urlopen(url) as resp:
        data = resp.read().decode('UTF-8')

    soup = bs(data, 'html.parser')
    tables = soup.find_all('table')
    try:
        rows = tables[table_index].find_all('tr')
    except IndexError:
        # Same exception type as a bare index, but say what was actually found.
        raise IndexError(
            "table index %d out of range: page has %d <table> elements"
            % (table_index, len(tables))
        ) from None

    # ``with`` guarantees the file is flushed and closed even if a row fails.
    with open(out_path, 'a', encoding=out_encoding) as out_file:
        for row in rows:
            # Raw markup, e.g.
            #   <tr class="FrameTreeFont"><td><span class="FrameDrawFont">...
            #   <a class="FrameItemFont" href="DAM_.htm" ...>DAM</a> Damage</td>...</tr>
            print(row)

            # Plain text of the row, e.g. "│─│─├─DAM Damage ×1 (M)".
            # (get_text("|", strip=True) would instead join the stripped
            # fragments with "|".)
            print(row.get_text())

            # Stripped fragments as a list, e.g.
            #   ['│', '─', '│', '─', '├─', 'DAM', 'Damage', '×1', '(M)']
            # Built once and reused for both the console and the file.
            cells = list(row.stripped_strings)
            print(cells)

            # write(), not writelines(): the payload is a single string.
            out_file.write(str(cells) + '\n')
# Run the scraper only when this file is executed directly, not on import.
if __name__ == "__main__":
    test1()
以上是 python 爬取网页内容 的全部内容, 来源链接: utcz.com/z/389476.html