Learning Python: A Novel Scraper
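This post walks through a small multithreaded novel scraper written while learning Python. It reads a chapter index page from biquge5200.cc, downloads every chapter in parallel with a thread pool, saves each chapter as a numbered .txt file, and finally merges the pieces into a single text file named after the book. The script follows, section by section.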
# coding: utf-8
from multiprocessing.dummy import Pool as ThreadPool
import multiprocessing
import requests, os, codecs, time
from lxml import etree

url = 'https://www.biquge5200.cc/79_79883/'  # chapter index page of the novel to download
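First, a small fetch helper: it wraps requests.get, forces the gbk encoding the site uses, and returns None on failure so callers can decide whether to retry.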
def getsource(url):
    # Fetch a page; return its decoded text, or None if the request fails
    try:
        s = requests.get(url, timeout=10)  # timeout so a hung request can't stall a worker
    except requests.RequestException:
        print('Request failed, skipping!')
        return None
    s.encoding = 'gbk'  # the site serves gbk-encoded pages
    return s.text
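If the site is flaky, a session with automatic retries is a sturdier fetch pattern than retrying in the caller. A minimal sketch, assuming the same gbk-encoded target; getsource_robust and its retry settings are illustrative, not part of the original script:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def getsource_robust(url):
    # hypothetical variant of getsource: retries on common 5xx errors
    # and bounds each attempt with a timeout
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=1,
                  status_forcelist=[500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retry))
    try:
        r = session.get(url, timeout=10)
        r.encoding = 'gbk'
        return r.text
    except requests.RequestException:
        return None

Next, getlist parses the chapter index: two XPath queries collect every chapter link and title under #list, and two more pull the book title and author line out of #info.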
def getlist(url):
    # Parse the chapter index: chapter links/titles plus book title and author
    global txtname, txtzz
    html = getsource(url)
    if html is None:
        raise SystemExit('Could not fetch the chapter index page')
    ehtml = etree.HTML(html)
    u = ehtml.xpath('//*[@id="list"]/dl/dd/a/@href')   # chapter URLs
    t = ehtml.xpath('//*[@id="list"]/dl/dd/a/text()')  # chapter titles
    # Book title, stripped of characters Windows forbids in file names
    txtname = ehtml.xpath('//*[@id="info"]/h1/text()')[0]
    for ch in '\\/:*?"<>|':
        txtname = txtname.replace(ch, '')
    # Author line, with non-breaking spaces removed
    txtzz = ehtml.xpath('//*[@id="info"]/p[1]/text()')[0].replace('\xa0', '')
    num = 0
    # The first 9 links are the "latest chapters" block repeated at the
    # top of the index, so start at index 9
    for i in range(9, len(u)):
        # Pack url, title and output index into one string, since
        # pool.map passes a single argument to the worker
        urllist.append(u[i] + '|' + t[i] + '|' + str(num))
        print(u[i] + '|' + t[i] + '|' + str(num))
        num += 1
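downtxt is the worker that pool.map runs. Since map passes exactly one argument, each job arrives as a packed 'url|title|index' string that the worker splits back apart; it re-fetches until it gets non-empty chapter text, then writes the chapter to <index>.txt unless that file already exists.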
def downtxt(url):
    # Download one chapter; the argument is a packed 'url|title|index' string
    global downcount
    u, t, num = url.split('|')
    content = ''
    while len(content) == 0:  # retry until we get non-empty chapter text
        html = getsource(u)
        if html is None:  # fetch failed, try again
            continue
        ehtml = etree.HTML(html)
        # string(...) flattens the chapter div to plain text; turn the
        # paragraph-indent spaces into line breaks and strip junk characters
        content = ehtml.xpath('string(//*[@id="content"])').replace('    ', '\r\n').replace(
            '\xa0', '').replace('\ufffd', '').replace('\u266a', '').replace('readx;', '')
    if os.path.exists(savepath + num + '.txt'):
        print(num + '.txt already exists!')
    else:
        with codecs.open(savepath + num + '.txt', 'a', encoding='utf-8') as f:
            f.write('\r\n' + t + '\r\n' + content)
        print(t + ' downloaded!')
        downcount += 1  # incremented from worker threads; see the note at the end
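The main section ties everything together: build the job list, create the output folder, run downtxt across the chapters with one thread per CPU core, then stitch the numbered chapter files into one book file, deleting each piece after it is merged.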
time_start = time.time()
downcount = 0
urllist = []
getlist(url)
# Save chapters into a folder named after the book
savepath = os.path.join(os.getcwd(), txtname) + os.sep
if not os.path.exists(savepath):
    os.makedirs(savepath)
# One worker thread per CPU core
pool = ThreadPool(multiprocessing.cpu_count())
results = pool.map(downtxt, urllist)
pool.close()
pool.join()
print('Merging txt files...')
with codecs.open(savepath + txtname + '.txt', 'a', encoding='utf-8') as f:
    f.write(txtname)
    f.write('\r\n')
    f.write(txtzz)
    f.write('\r\n')
    # Chapters were saved as 0.txt, 1.txt, ..., so index order is chapter order
    for i in range(0, len(urllist)):
        with open(savepath + str(i) + '.txt', 'r', encoding='utf-8') as fr:
            txt = fr.read()
        f.write(txt)
        f.write('===========================')
        os.remove(savepath + str(i) + '.txt')  # delete the piece once merged
print('Merge finished!')

print('')
print('*' * 15 + ' Done. Summary: ' + '*' * 15)
print('')
print('<' + txtname + '> downloaded, chapter pages fetched and saved: ' + str(downcount))
print('')
print('Elapsed: ' + str(time.time() - time_start) + ' s')
print('')
print('*' * 51)
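One caveat: downcount is a plain global incremented from several worker threads. CPython's GIL makes the race unlikely to bite here, but a cleaner pattern is to let each worker return a value and sum what pool.map already collects. A minimal sketch, with a hypothetical download worker standing in for downtxt:

from multiprocessing.dummy import Pool as ThreadPool

def download(task):
    # do the real fetching and saving here, then report one
    # completed chapter through the return value
    return 1

tasks = ['chapter-%d' % i for i in range(5)]  # placeholder job list
pool = ThreadPool(4)
downcount = sum(pool.map(download, tasks))    # no shared global needed
pool.close()
pool.join()
print(downcount)  # -> 5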