python-requests爬取数据保存至数据库

python

 1 import requests

2 from fake_useragent import UserAgent

3 from lxml import etree

4 import pymysql

5

6

# Scrape the qiushibaike text-joke listing page, follow each joke's detail
# link, and store (img, author, detail_url, content) rows in the `duanzi` table.

# NOTE(review): database credentials are hard-coded here; consider loading
# them from the environment or a config file kept out of version control.
# NOTE(review): MySQL's 'utf8' charset only covers 3-byte characters; if the
# scraped text can contain emoji, 'utf8mb4' may be required -- confirm against
# the table definition before changing it.
conn = pymysql.connect(host='47.107.190.1', port=3306, user='zhangsan',
                       password='Changeme_123', database='qiubai', charset='utf8')
# Create the cursor.
cursor = conn.cursor()

url = 'https://www.qiushibaike.com/text/'

ua = UserAgent()

headers = {
    'User-Agent': ua.random
}

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10

# The placeholders must NOT be quoted: the driver escapes and quotes each
# parameter itself, so writing "%s" would wrap every stored value in stray
# quote characters (or produce invalid SQL, depending on the server mode).
insert_sql = ('insert into duanzi(img, author, detail_url, content) '
              'values(%s, %s, %s, %s)')

try:
    response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)

    # Parse the listing page.
    html = etree.HTML(response.text)
    # Select the <div> elements that hold the individual jokes.
    divs = html.xpath('//div[contains(@id, "qiushi_tag_")]')

    # Extract the required fields from each joke <div>.
    for div in divs:
        # Once an element is located, paths must start with ./ so the lookup
        # stays inside that element instead of searching the whole document.
        img_srcs = div.xpath('.//div[contains(@class, "author")]/a[1]/img/@src')
        # Some entries may have no avatar link; store an empty string rather
        # than crashing on [0].
        img = img_srcs[0] if img_srcs else ''
        author = div.xpath('string(.//div[contains(@class, "author")]/a[2])').strip()

        hrefs = div.xpath('.//a[@class="contentHerf"]/@href')
        if not hrefs:
            # Without a detail link there is nothing to fetch for this entry.
            continue
        detail_url = 'https://www.qiushibaike.com' + hrefs[0]

        # Open the detail page and scrape the full joke text from it.
        try:
            detail_response = requests.get(url=detail_url, headers=headers,
                                           timeout=REQUEST_TIMEOUT)
        except requests.RequestException as e:
            # One unreachable detail page should not abort the whole crawl.
            print(e)
            continue
        detail_html = etree.HTML(detail_response.text)
        content = detail_html.xpath('string(//div[@class="content"])').strip()

        # Store the scraped record; commit per row so one bad row only
        # rolls back itself.
        try:
            cursor.execute(insert_sql, (img, author, detail_url, content))
            conn.commit()
        except pymysql.MySQLError as e:
            print(e)
            conn.rollback()
finally:
    # Release the database resources even if the crawl fails part-way.
    cursor.close()
    conn.close()

以上是 python-requests爬取数据保存至数据库 的全部内容, 来源链接: utcz.com/z/387981.html

回到顶部