Python crawler: scraping Sohu Video movies and storing them in a MySQL database


Code:

import time
import traceback
import requests
import pymysql
from bs4 import BeautifulSoup

# Connect to the database and get a cursor
def get_conn():
    """
    :return: connection, cursor
    """
    # Create the connection
    conn = pymysql.connect(host="127.0.0.1",
                           user="root",
                           password="000429",
                           db="movierankings",
                           charset="utf8")
    # Create the cursor; result sets come back as tuples by default
    cursor = conn.cursor()
    if conn is not None and cursor is not None:
        print("Database connection succeeded! Cursor created!")
    else:
        print("Database connection failed!")
    return conn, cursor

# Close the database connection and cursor
def close_conn(conn, cursor):
    if cursor:
        cursor.close()
    if conn:
        conn.close()
    return 1

def get_souhu():
    # Top rated
    url = 'https://film.sohu.com/list_0_0_0_2_2_1_60.html?channeled=1200100000'
    # Latest releases
    new_url = 'https://film.sohu.com/list_0_0_0_2_1_1_60.html?channeled=1200100000'
    # Hot this week
    week_url = 'https://film.sohu.com/list_0_0_0_2_0_1_60.html?channeled=1200100000'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }

    # Initialize the result lists
    templist = []
    dataRes = []
    # Top rated: walk the paginated list until an empty page comes back
    for i in range(1, 31):
        url_1 = 'https://film.sohu.com/list_0_0_0_2_2_'
        auto = str(i)
        url_2 = '_60.html?channeled=1200100000'
        url = url_1 + auto + url_2
        # headers must be passed as a keyword argument; passed positionally
        # it would be treated as query params rather than request headers
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        page_text = response.text
        # Collect all the <li> entries
        soup = BeautifulSoup(page_text, 'lxml')
        # CSS child selector: the <li> items directly under .movie-list
        li_list = soup.select('.movie-list>li')
        print(len(li_list))
        if len(li_list) == 0:
            print("Finished scraping the top-rated list!")
            if len(dataRes) != 0:
                return dataRes
        for li in li_list:
            li_soup = BeautifulSoup(str(li), 'lxml')
            # Movie name
            name = li_soup.find('div', class_="v_name_info").text
            templist.append(name)
            # Rating: slice the numeric part (e.g. "8.2") off the tail of the text
            score = li_soup.find('span', class_='v_score').text
            score = score[-4:-1]
            templist.append(score)
            # Detail-page path
            path = li_soup.find('a', target="_blank")['href']
            templist.append(path)
            # Playback state
            state = "VIP"
            templist.append(state)
            print(templist)
            dataRes.append(templist)
            templist = []
        print("-------------------------------------------")

    # The "latest releases" list (URL prefix list_0_0_0_2_1_) and the
    # "hot this week" list (URL prefix list_0_0_0_2_0_) were scraped in the
    # original post with two more loops identical to the one above, differing
    # only in the URL prefix and the end-of-list message; both were left
    # commented out, along with this optional dedup pass over dataRes:
    # old_list = dataRes
    # new_list = []
    # for i in old_list:
    #     if i not in new_list:
    #         new_list.append(i)
    # print(new_list)

    return dataRes

# Insert into the database
def insert_souhu():
    cursor = None
    conn = None
    try:
        count = 0
        movies = get_souhu()  # renamed to avoid shadowing the built-in `list`
        print(f"{time.asctime()} Started inserting Sohu movie data")
        conn, cursor = get_conn()
        sql = "insert into moviesohu (id,name,score,path,state) values(%s,%s,%s,%s,%s)"
        for item in movies:
            print(item)
            count = count + 1
            # Catch the exception so a duplicate primary key won't abort the run
            try:
                cursor.execute(sql, [0, item[0], item[1], item[2], item[3]])
            except pymysql.err.IntegrityError:
                print("Duplicate! Skipping!")
            conn.commit()  # commit the transaction (update/delete/insert)
        print(f"{time.asctime()} Finished inserting Sohu movie data")
    except:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)

if __name__ == '__main__':
    # get_iqy()
    # get_souhu()
    insert_souhu()
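
An alternative to catching pymysql.err.IntegrityError row by row is to let MySQL resolve the collision on the name primary key itself with INSERT ... ON DUPLICATE KEY UPDATE. Below is a minimal sketch of that variant; the function name upsert_souhu and the choice to refresh score on a duplicate are illustrative assumptions, and it reuses get_souhu(), get_conn() and close_conn() from above.

def upsert_souhu():
    # Hypothetical variant: MySQL resolves duplicates on the `name`
    # primary key and refreshes the score instead of raising an error
    conn, cursor = get_conn()
    sql = ("insert into moviesohu (id,name,score,path,state) "
           "values(%s,%s,%s,%s,%s) "
           "on duplicate key update score=values(score)")
    try:
        for item in get_souhu():
            cursor.execute(sql, [0, item[0], item[1], item[2], item[3]])
        conn.commit()
    finally:
        close_conn(conn, cursor)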

Run screenshot

Database screenshot

Table creation statement:

CREATE TABLE `moviesohu` (
  `id` INT(11) NOT NULL AUTO_INCREMENT,
  `name` VARCHAR(45) COLLATE utf8_bin NOT NULL,
  `score` VARCHAR(45) COLLATE utf8_bin NOT NULL,
  `path` VARCHAR(100) COLLATE utf8_bin NOT NULL,
  `state` VARCHAR(10) COLLATE utf8_bin NOT NULL,
  PRIMARY KEY (`name`),
  KEY `id` (`id`)
) ENGINE=INNODB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
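
To spot-check what actually landed in the table after a run, a small helper along these lines can be appended to the script; the name check_souhu is made up for illustration, and it reuses get_conn() and close_conn() from the code above.

def check_souhu():
    # Sanity check: print the row count plus a handful of sample rows
    conn, cursor = get_conn()
    try:
        cursor.execute("select count(*) from moviesohu")
        print("rows:", cursor.fetchone()[0])
        cursor.execute("select name, score, state from moviesohu limit 10")
        for row in cursor.fetchall():
            print(row)
    finally:
        close_conn(conn, cursor)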

That is the full content of "Python crawler: scraping Sohu Video movies and storing them in a MySQL database". Source: utcz.com/z/389426.html
