Python 爬取网页内容

python

 1 #encoding:UTF-8

2 import urllib

3 import urllib.request

4 import bs4

5 from bs4 import BeautifulSoup as bs

def test1(url="http://www.stylusstudio.com/edifact/D95B/CODECO.htm",
          table_index=7, out_path='./text1.txt', timeout=30):
    """Scrape the rows of one HTML table and append them to a text file.

    The page at *url* is downloaded and decoded as UTF-8, parsed with
    BeautifulSoup's ``html.parser``, and every ``<tr>`` of the table at
    position *table_index* is processed. For each row the function prints
    the raw tag, its plain text and the list of its stripped strings, and
    appends that list (as ``str(list)`` plus a newline) to *out_path*.

    Args:
        url: Address of the page to download.
        table_index: 0-based position of the ``<table>`` among all tables
            returned by ``find_all('table')``. The default 7 selects the
            eighth table of the page.
        out_path: File that rows are appended to; it is written with the
            ``cp852`` encoding.
        timeout: Seconds to wait for the server before ``urlopen`` gives
            up, so a stalled connection cannot hang the script forever.

    Raises:
        IndexError: If the page has no table at *table_index*.
        UnicodeEncodeError: If a row contains a character that ``cp852``
            cannot represent.
    """
    # The context manager closes the HTTP response even if read/decode fails.
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        data = resp.read().decode('UTF-8')

    soup = bs(data, 'html.parser')
    tables = soup.find_all('table')
    try:
        rows = tables[table_index].find_all('tr')
    except IndexError:
        raise IndexError(
            "page has {} table(s); no table at index {}".format(
                len(tables), table_index)
        ) from None

    # 'a' appends, so repeated runs accumulate rows in the same file.
    # The `with` block guarantees the file is closed if a row fails.
    with open(out_path, 'a', encoding='cp852') as out_file:
        for item in rows:
            # Raw tag, e.g.:
            #   <tr class="FrameTreeFont"><td><span class="FrameDrawFont">│
            #   <span class="FrameHideFont">─</span>│<span class="FrameHideFont">─</span>├─</span>
            #   <a class="FrameItemFont" href="DAM_.htm" target="classFrame" title="Damage">DAM</a>
            #   Damage</td><td align="right"><span class="FrameDetailFont"> ×1
            #   </span></td><td><span class="FrameDetailFont">(M)</span></td></tr>
            print(item)

            # Plain-text rendering, e.g.:  │─│─├─DAM Damage ×1 (M)
            print(item.get_text())

            # item.td.span.get_text() would fetch the text of one specific
            # nested tag instead of the whole row.

            # List rendering, e.g.:
            #   ['│', '─', '│', '─', '├─', 'DAM', 'Damage', '×1', '(M)']
            # Related API: soup.get_text("|") joins text pieces with "|"
            # (u'\nI linked to |example.com|\n'); adding strip=True also
            # strips the whitespace around every piece
            # (u'I linked to|example.com').
            cells = list(item.stripped_strings)
            print(cells)

            # One row per line. write() is used because writelines() on a
            # plain string would iterate it character by character.
            out_file.write(str(cells) + '\n')

# Entry point: run the scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    test1()

以上是 Python 爬取网页内容 的全部内容，来源链接：utcz.com/z/389476.html

回到顶部