Scraping Dell monitor review data from JD.com with selenium + bs4
def main():
    from spider_re import get_html, get_info, save_info
    url = 'https://item.jd.com/2316993.html#comment'
    html_list = get_html.get_html(url, 20)
    data = get_info.get_info(html_list)
    print('Fetched {} records in total'.format(len(data)))
    print('Saving the data, please wait...')
    save_info.save_info(data)

if __name__ == '__main__':
    main()
    print('All data has been saved')
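main() assumes a spider_re package that exposes the three modules defined below. A layout like the following makes the imports resolve; the file names are an assumption inferred from the import statements and are not shown in the original post:

# spider_re/
#     __init__.py
#     get_html.py    # defines get_html(url, page_num)
#     get_info.py    # defines get_info(html_list)
#     save_info.py   # defines save_info(total_info)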
def get_html(url, page_num):
    """
    url: the URL to scrape
    page_num: number of pages to fetch
    """
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    import time
    # Create a Chrome driver in headless mode
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    # The Chrome documentation recommends this flag to work around a bug
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(chrome_options=chrome_options)
    html_list = []
    # Open the product page
    driver.get(url)
    time.sleep(2)
    # Click the "product reviews" tab
    driver.find_element_by_xpath('//*[@id="detail"]/div[1]/ul/li[5]').click()
    time.sleep(1)
    for i in range(page_num):
        html_list.append(driver.page_source)
        print('Fetching page {}'.format(i + 1))
        time.sleep(2)
        try:
            # Click "next page" so the review list on the current page is refreshed
            driver.find_element_by_xpath('//div[@id="comment-0"]/div[13]/div/div/a[@class="ui-pager-next"]').click()
            time.sleep(1)
        except Exception as e:
            print(e)
    # Quit the browser
    driver.quit()
    return html_list
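The fixed time.sleep calls above make the scraper fragile when the page loads slowly. Below is a minimal sketch of the same two clicks using Selenium's explicit waits instead; the XPath expressions are copied from the code above and remain assumptions about JD's page structure, which changes over time:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_when_ready(driver, xpath, timeout=10):
    # Block until the element is clickable, then click it
    WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath))
    ).click()

# Inside get_html these calls would replace the sleep-then-click pairs, e.g.:
# click_when_ready(driver, '//*[@id="detail"]/div[1]/ul/li[5]')
# click_when_ready(driver, '//div[@id="comment-0"]/div[13]/div/div/a[@class="ui-pager-next"]')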
def get_info(html_list):
    """Parse the pages with BeautifulSoup and extract the data."""
    from bs4 import BeautifulSoup
    total_info = []
    for content in html_list:
        soup = BeautifulSoup(content, 'html.parser')
        try:
            # Each page holds up to 10 comment blocks
            datas = soup.select('#comment-0 > div.comment-item')
        except Exception as e:
            raise e
        # Walk through each comment block
        for data in datas:
            try:
                # Review text
                comment_text = data.find_all('p', {'class': 'comment-con'})
                # Check whether there is a follow-up (appended) review
                if len(comment_text) == 2:
                    comment_append = comment_text[1].string
                else:
                    comment_append = 'None '
            except Exception as e:
                raise e
            # Review time
            comment_time = data.select('div.comment-message > div.order-info > span')[1].get_text()
            # Order type
            order_type = data.select('div.comment-message > div.order-info > span')[0].get_text()
            # Collect one row of data
            total_info.append([comment_time, order_type, comment_text[0].string, comment_append])
    return total_info
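The CSS selectors in get_info encode assumptions about JD's comment markup. A quick offline sanity check is to feed the function a small handcrafted page that mimics that structure; the snippet below is invented for illustration and is not real JD HTML:

sample_html = '''
<div id="comment-0">
  <div class="comment-item">
    <p class="comment-con">屏幕清晰，色彩不错</p>
    <div class="comment-message">
      <div class="order-info">
        <span>京东自营</span>
        <span>2019-06-01 10:23</span>
      </div>
    </div>
  </div>
</div>
'''
print(get_info([sample_html]))
# Expected output: [['2019-06-01 10:23', '京东自营', '屏幕清晰，色彩不错', 'None ']]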
def save_info(total_info):
    """Export the data to an Excel file."""
    import pandas as pd
    dataseris = pd.DataFrame(total_info, columns=['评论时间', '订单类型', '评论内容', '追加评论'])
    dataseris.to_excel('E:\\info.xlsx')
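Note that pandas.DataFrame.to_excel needs an Excel writer backend (openpyxl for .xlsx) installed, and the hard-coded E:\info.xlsx path only works on Windows. A minimal alternative sketch that writes CSV instead; the function name save_info_csv and the file name info.csv are made up for this example:

import pandas as pd

def save_info_csv(total_info):
    # Same rows and columns as save_info, written next to the script;
    # CSV needs no extra Excel dependency
    df = pd.DataFrame(total_info, columns=['评论时间', '订单类型', '评论内容', '追加评论'])
    # utf-8-sig keeps the Chinese headers readable when the file is opened in Excel
    df.to_csv('info.csv', index=False, encoding='utf-8-sig')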
Result: the scraped comments are written to E:\info.xlsx.