Python中for循环中列表切片问题

Python中for循环中列表切片问题

这个程序是抽取豆瓣top250页面所有电影相关信息(名称,分数,影评人数,引用语)。

问题是在parse_page函数中,top250共十个页面,成功提取前八页的信息,但最后两页的信息提取有问题提示list index out of range ,但此数据在for中有显示,for之外调用时就出错。求解。

import socket

import ssl

def log(*args, **kwargs):

print('log: ', *args, **kwargs)

def parse_url(url):

# 提取协议与uri

protocol = url.split('://')[0]

if protocol == 'http':

protocol = 'http'

uri = url.split('://')[1]

elif protocol == 'https':

protocol = 'https'

uri = url.split('://')[1]

else:

uri = url

# 提取主机地址

index = uri.find('/')

if index == -1:

host = uri

else:

host = uri.split('/')[0]

# 提取端口号

http_ports = {

'http': 80,

'https': 443,

}

if protocol in http_ports:

port = http_ports[protocol]

else:

port = uri.split(':')[1]

# 提取路径

if index == -1:

path = '/'

else:

path = '/' + uri.split('/')[1]

return protocol, host, port, path

def socket_by_protocol(protocol):

if protocol == 'http':

s = socket.socket()

elif protocol == 'https':

s = ssl.wrap_socket(socket.socket())

return s

def response_by_socket(s):

buffer_size = 1024

all_data = b''

while True:

response = s.recv(buffer_size)

if len(response) == 0:

break

all_data += response

return all_data.decode()

def parse_response(response):

errors = ''

if response:

header, body = response.split('\r\n\r\n', 1)

header_line = header.split('\r\n')

status_code = header_line[0].split()[1]

headers = {}

for line in header_line[1:]:

k, v = line.split(': ')

headers[k] = v

else:

errors = 'response is null value.'

headers = {}

body = ''

return status_code, headers, body

def construct_request(host, path):

request = 'GET {} HTTP/1.1\r\nhost: {}\r\nconnection: close\r\n\r\n'.format(path, host)

return request.encode()

def get(url, query):

protocol, host, port, path = parse_url(url)

s = socket_by_protocol(protocol)

s.connect((host, port))

cons_path = '{}?{}={}'.format(path, query[1], query[0])

request = construct_request(host, cons_path)

s.send(request)

response = response_by_socket(s)

status_code, header, body = parse_response(response)

return status_code, header, body

def parse_page(source=''):

mv_name = []

mv_score = []

mv_people = []

mv_quot = []

first_split = str(source.split('<ol class="grid_view">').pop(1))

second_split = str(first_split.split('</ol>').pop(0))

third_split = second_split.split('<div class="info">')

del third_split[0]

for line in third_split:

line = line.split('</li>')

del line[1]

# 名称抽取

raw_single_mv_name = line[0].split('</a>')[0].split('<span class="title">')[1]

single_mv_name = raw_single_mv_name.split('</span>')[0]

mv_name.append(single_mv_name)

# 分数与评价人数抽取

raw_single_mv_evaluate = line[0].split('<div class="star">')[1].split('</span>')

single_mv_score = raw_single_mv_evaluate[1].split('">')[1]

mv_score.append(single_mv_score)

single_mv_people = raw_single_mv_evaluate[3].split('<span>')[1]

mv_people.append(single_mv_people)

# 引用语抽取

# log(mv_name, mv_score, mv_people, line[0])

# log(line[0].split('<span class="inq">')[1])

raw_singe_mv_quot = line[0].split('<span class="inq">')[1]

# log(raw_singe_mv_quot)

single_mv_quot = raw_singe_mv_quot.split('</span>')[0]

#log(single_mv_quot)

mv_quot.append(single_mv_quot)

# 此处mv_quot有值

log(mv_quot)

#为何这里mv_quot提示list index out of range

log(mv_quot)

# log(len(mv_name), len(mv_score), len(mv_people), len(mv_quot))

return mv_name, mv_score, mv_people, mv_quot

def main():

url = "https://movie.douban.com/top250"

protocol, host, port, path = parse_url(url)

log(protocol, host, port, path)

queries = {}

for v in [value for value in range(250, 0, -25)]:

queries[v] = 'start'

log(queries)

i = 0

for q in queries.items():

try:

status_code, header, body = get(url, q)

"""

if i == 8:

log(status_code, header, body)

"""

mvo_name, mvo_score, mvo_people, mvo_quot = parse_page(source=body)

# log(mvo_name)

# log(mvo_score)

# log(mvo_people)

log(mvo_quot)

i += 1

except Exception as e:

log(e)

continue

if __name__ == '__main__':

main()


回答:

这行代码有问题

raw_singe_mv_quot = line[0].split('<span class="inq">')[1]

拆开解释

tmp_list = line[0].split('<span class="inq">')

raw_singe_mv_quot = tmp_list[1]

tmp_list 这个列表的长度可能为 1,所以 tmp_list[1] 会报错误。

具体逻辑我也没看,你自己排查吧!


回答:

log: list index out of range是被main函数这一行打印出来的

python3">        except Exception as e:

log(e) # note!

continue

把你写的try...catch去掉,解释器给你的信息已足够

Traceback (most recent call last):

File "test_dou.py", line 173, in <module>

main()

File "test_dou.py", line 162, in main

mvo_name, mvo_score, mvo_people, mvo_quot = parse_page(source=body)

File "test_dou.py", line 134, in parse_page

raw_singe_mv_quot = line[0].split('<span class="inq">')[1]

IndexError: list index out of range


回答:

这里有个例子

import requests

import random

from lxml import etree

from bs4 import BeautifulSoup

url = "https://movie.douban.com/top250?start={}&filter="

with open('douban.txt', 'w', encoding='utf-8') as f:

for num in range(0, 5, 25):

url_next = url.format(num)

pro = ['122.152.196.126', '114.215.174.227', '119.185.30.75']

head = {

'user-Agent':

'Mozilla/5.0(Windows NT 10.0;Win64 x64)AppleWebkit/537.36(KHTML,like Gecko) chrome/58.0.3029.110 Safari/537.36'

}

response = requests.get(url_next,

proxies={'http': random.choice(pro)},

headers=head,

timeout=3)

soup = BeautifulSoup(response.text, 'lxml')

names = soup.select('div.hd > a')

levels = soup.select('span.rating_num')

for name, level in zip(names, levels):

name = name.get_text().split('/')[0].split('\n')[1]

level = level.get_text()

f.write(name + level + '\n')

print('保存成功')

以上是 Python中for循环中列表切片问题 的全部内容, 来源链接: utcz.com/a/158446.html

回到顶部