Python Crawler [Practical]: Scraping Baidu Tieba Pages and Saving Them Locally

Here is the full code:

import requests


class TiebaSpider:

    def __init__(self, tieba_name):
        self.tieba_name = tieba_name
        # URL template; pn is the pagination offset (50 posts per page)
        self.url_temp = "https://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
        }

    def get_url_list(self):
        """Build the list of page URLs."""
        return [self.url_temp.format(i * 50) for i in range(50)]

    def parse_url(self, url):
        """Send the request and return the raw response body."""
        print(url)  # log the URL being fetched
        response = requests.get(url=url, headers=self.headers)
        return response.content

    def save_html(self, html_str, page_num):
        # Build the file name, e.g. "lol_page_1.html"
        file_path = "{}_page_{}.html".format(self.tieba_name, page_num)
        # response.content is bytes, so write in binary mode
        with open(file_path, "wb") as f:
            f.write(html_str)

    def run(self):
        """Main logic: build the URLs, fetch each page, save it to disk."""
        url_list = self.get_url_list()
        # enumerate gives the page number directly and is cheaper and
        # safer than url_list.index(url)
        for page_num, url in enumerate(url_list, start=1):
            html_str = self.parse_url(url=url)
            self.save_html(html_str, page_num)


if __name__ == '__main__':
    tieba_spider = TiebaSpider("lol")
    tieba_spider.run()
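To see how the pagination works, here is a quick sanity check of the URLs that get_url_list builds (a minimal sketch; the expected output simply follows from the URL template above, where pn advances by 50 per page):

spider = TiebaSpider("lol")
for url in spider.get_url_list()[:3]:
    print(url)

# Expected output:
# https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=0
# https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=50
# https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=100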

That concludes this article. Source: utcz.com/z/387148.html
