Learning Python: a Novel Scraper
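The script below is a small exercise in scraping a web novel from biquge5200.cc. It first parses the chapter-list page for the book title, the author line and the chapter links, then downloads the chapters in parallel with a thread pool (one worker per CPU core), saving each chapter as a numbered .txt file, and finally merges the pieces into a single <title>.txt and removes the per-chapter files.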

```python
# coding: utf-8
from multiprocessing.dummy import Pool as ThreadPool
import multiprocessing
import requests, os, codecs, time
from lxml import etree

url = 'https://www.biquge5200.cc/79_79883/'  # chapter-list page of the novel to download


def getsource(url):
    """Fetch a page; the site serves GBK-encoded HTML."""
    try:
        s = requests.get(url)
    except Exception:
        print('Request failed, skipping!')
        return ''
    s.encoding = 'gbk'
    return s.text


def getlist(url):
    """Parse the chapter-list page: novel title, author and chapter links."""
    global txtname, txtzz
    html = getsource(url)
    ehtml = etree.HTML(html)
    u = ehtml.xpath('//*[@id="list"]/dl/dd/a/@href')
    t = ehtml.xpath('//*[@id="list"]/dl/dd/a/text()')
    # strip characters that are illegal in file names from the title
    txtname = ehtml.xpath('//*[@id="info"]/h1/text()')[0].replace('\\', '').replace('/', '').replace(':', '').replace(
        '*', '').replace('?', '').replace('"', '').replace('<', '').replace('>', '').replace('|', '')
    # author line
    txtzz = ehtml.xpath('//*[@id="info"]/p[1]/text()')[0].replace('\xa0', '')
    num = 0
    # the first 9 links are "latest chapter" shortcuts, so skip them;
    # each work item is packed as "chapter_url|chapter_title|index"
    for i in range(9, len(u)):
        urllist.append(u[i] + '|' + t[i] + '|' + str(num))
        print(u[i] + '|' + t[i] + '|' + str(num))
        num += 1


def downtxt(url):
    """Download one chapter and save it as <index>.txt."""
    global downcount
    u = url.split('|')[0]
    t = url.split('|')[1]
    num = url.split('|')[2]
    content = ''
    # retry until the chapter body comes back non-empty
    while len(content) == 0:
        html = getsource(u)
        if not html:
            continue
        ehtml = etree.HTML(html)
        # turn indent spaces into line breaks and drop stray characters
        content = ehtml.xpath('string(//*[@id="content"])').replace(' ', '\r\n').replace('  ', '\r\n').replace(
            '\xa0', '').replace('\ufffd', '').replace('\u266a', '').replace('readx;', '')
    if os.path.exists(savepath + num + '.txt'):
        print(num + '.txt already exists!')
    else:
        with codecs.open(savepath + num + '.txt', 'a', encoding='utf-8') as f:
            f.write('\r\n' + t + '\r\n' + content)
        print(t + ' downloaded!')
        downcount += 1


time_start = time.time()
downcount = 0
urllist = []
getlist(url)
savepath = os.path.join(os.getcwd(), txtname) + os.sep
if not os.path.exists(savepath):
    os.makedirs(savepath)
# one worker thread per CPU core
pool = ThreadPool(multiprocessing.cpu_count())
results = pool.map(downtxt, urllist)
pool.close()
pool.join()
print('Merging chapter files...')
with codecs.open(savepath + txtname + '.txt', 'a', encoding='utf-8') as f:
    f.write(txtname)
    f.write('\r\n')
    f.write(txtzz)
    f.write('\r\n')
    for i in range(0, len(urllist)):
        with open(savepath + str(i) + '.txt', 'r', encoding='utf-8') as fr:
            txt = fr.read()
        f.write(txt)
        f.write('===========================')
        os.remove(savepath + str(i) + '.txt')
print('Merge finished!')

print('')
print('*' * 15 + ' Done, summary: ' + '*' * 15)
print('')
print('<' + txtname + '> downloaded, ' + str(downcount) + ' chapter pages fetched')
print('')
print('Elapsed: ' + str(time.time() - time_start) + ' s')
print('')
print('*' * 51)
```
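Before kicking off a full download, it can be worth checking that the XPath selectors still match the site. The probe below is a minimal sketch and not part of the original script; it assumes the same page structure (the id="list" chapter links and the id="info" title block) that the script's XPath expressions rely on, and the site may block automated requests or change its markup at any time.

```python
# Minimal sanity check for the selectors used above (a sketch,
# not part of the original script; page structure is assumed unchanged).
import requests
from lxml import etree

resp = requests.get('https://www.biquge5200.cc/79_79883/')
resp.encoding = 'gbk'  # the site serves GBK-encoded HTML
page = etree.HTML(resp.text)

title = page.xpath('//*[@id="info"]/h1/text()')
chapters = page.xpath('//*[@id="list"]/dl/dd/a/@href')
print(title)          # expect the novel title in a one-element list
print(len(chapters))  # expect a non-zero count (the first 9 are "latest chapter" links)
```

If either result comes back empty, the selectors (or the start index 9 in getlist) need adjusting before the downloader will produce anything.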

 

That is the full content of "Learning Python: a Novel Scraper". Source: utcz.com/z/388797.html
