selenium+bs4库爬取京东商城戴尔显示器数据

def main():
    """Entry point: crawl Dell monitor reviews from a JD.com item page and persist them."""
    from spider_re import get_html, get_info, save_info

    # Product page; the #comment fragment jumps straight to the review section.
    url = 'https://item.jd.com/2316993.html#comment'

    # Render 20 pages of reviews in a browser, then parse them out.
    pages = get_html.get_html(url, 20)
    records = get_info.get_info(pages)

    print('一共抓取{}条数据'.format(len(records)))
    print('数据正在保存中,请稍等。。。。')
    save_info.save_info(records)


if __name__ == '__main__':
    main()
    print('数据已经存储完毕')

def get_html(url, page_num):
    """Capture `page_num` pages of review HTML from a JD.com product page.

    url: product page URL to crawl.
    page_num: number of review pages to capture.
    Returns a list of rendered page-source strings, one per review page.
    """
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    import time

    # Headless Chrome; per Google's docs --disable-gpu works around a headless bug.
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(chrome_options=chrome_options)

    html_list = []
    try:
        driver.get(url)
        time.sleep(2)

        # Click the product-review tab so the review pane is loaded.
        driver.find_element_by_xpath('//*[@id="detail"]/div[1]/ul/li[5]').click()
        time.sleep(1)

        for i in range(page_num):
            html_list.append(driver.page_source)
            print('正在获取第{}个界面'.format(i + 1))
            time.sleep(2)
            try:
                # BUG FIX: the original located the "next page" link but never
                # clicked it, so every captured page was the same first page.
                driver.find_element_by_xpath(
                    '//div[@id="comment-0"]/div[13]/div/div/a[@class="ui-pager-next"]'
                ).click()
                time.sleep(1)
            except Exception as e:
                # Best-effort paging: keep the pages captured so far.
                print(e)
    finally:
        # Always release the browser, even if navigation fails mid-crawl
        # (the original leaked the Chrome process on any exception).
        driver.quit()

    return html_list

def get_info(html_list):
    """Parse captured review pages with BeautifulSoup and extract review records.

    html_list: list of page-source strings produced by get_html().
    Returns a list of [comment_time, order_type, comment, appended_comment] rows.
    """
    from bs4 import BeautifulSoup

    total_info = []
    for content in html_list:
        soup = BeautifulSoup(content, 'html.parser')
        # Each page holds up to 10 review blocks under #comment-0.
        datas = soup.select(r'#comment-0 > div.comment-item')

        for data in datas:
            comment_text = data.find_all('p', {'class': 'comment-con'})
            # BUG FIX: the original crashed with IndexError on comment_text[0]
            # when a block had no review paragraph; skip malformed blocks.
            if not comment_text:
                continue

            # A second paragraph, when present, is an appended follow-up review.
            if len(comment_text) == 2:
                comment_append = comment_text[1].string
            else:
                comment_append = 'None '

            # Hoist the repeated span lookup: index 1 holds the review time,
            # index 0 the order type.
            order_spans = data.select(r'div.comment-message > div.order-info > span')
            comment_time = order_spans[1].get_text()
            order_type = order_spans[0].get_text()

            total_info.append(
                [comment_time, order_type, comment_text[0].string, comment_append]
            )

    return total_info

def save_info(total_info, path='E:\\info.xlsx'):
    """Export extracted review records to an Excel workbook.

    total_info: list of [comment_time, order_type, comment, appended_comment] rows.
    path: output .xlsx path; defaults to the original hard-coded location so
        existing callers are unaffected.
    """
    import pandas as pd

    frame = pd.DataFrame(
        total_info,
        columns=['评论时间', '订单类型', '评论内容', '追加评论'],
    )
    frame.to_excel(path)

结果:

以上是 selenium+bs4库爬取京东商城戴尔显示器数据 的全部内容, 来源链接: utcz.com/a/52418.html

回到顶部