Python中for循环中列表切片问题

Z时代
2024-01-10
分类：技术分享

Python中for循环中列表切片问题

这个程序是抽取豆瓣top250页面所有电影相关信息（名称，分数，影评人数，引用语）。

问题是在parse_page函数中，top250共十个页面，成功提取前八页的信息，但最后两页的信息提取有问题提示list index out of range ，但此数据在for中有显示，for之外调用时就出错。求解。

import socket
import ssl
def log(*args, **kwargs):
    print('log: ', *args, **kwargs)
def parse_url(url):
    # 提取协议与uri
    protocol = url.split('://')[0]
    if protocol == 'http':
        protocol = 'http'
        uri = url.split('://')[1]
    elif protocol == 'https':
        protocol = 'https'
        uri = url.split('://')[1]
    else:
        uri = url
    # 提取主机地址
    index = uri.find('/')
    if index == -1:
        host = uri
    else:
        host = uri.split('/')[0]
    # 提取端口号
    http_ports = {
        'http': 80,
        'https': 443,
    }
    if protocol in http_ports:
        port = http_ports[protocol]
    else:
        port = uri.split(':')[1]
    # 提取路径
    if index == -1:
        path = '/'
    else:
        path = '/' + uri.split('/')[1]
    return protocol, host, port, path
def socket_by_protocol(protocol):
    if protocol == 'http':
        s = socket.socket()
    elif protocol == 'https':
        s = ssl.wrap_socket(socket.socket())
    return s
def response_by_socket(s):
    buffer_size = 1024
    all_data = b''
    while True:
        response = s.recv(buffer_size)
        if len(response) == 0:
            break
        all_data += response
    return all_data.decode()
def parse_response(response):
    errors = ''
    if response:
        header, body = response.split('\r\n\r\n', 1)
        header_line = header.split('\r\n')
        status_code = header_line[0].split()[1]
        headers = {}
        for line in header_line[1:]:
            k, v = line.split(': ')
            headers[k] = v
    else:
        errors = 'response is null value.'
        headers = {}
        body = ''
    return status_code, headers, body
def construct_request(host, path):
    request = 'GET {} HTTP/1.1\r\nhost: {}\r\nconnection: close\r\n\r\n'.format(path, host)
    return request.encode()
def get(url, query):
    protocol, host, port, path = parse_url(url)
    s = socket_by_protocol(protocol)
    s.connect((host, port))
    cons_path = '{}?{}={}'.format(path, query[1], query[0])
    request = construct_request(host, cons_path)
    s.send(request)
    response = response_by_socket(s)
    status_code, header, body = parse_response(response)
    return status_code, header, body
def parse_page(source=''):
    mv_name = []
    mv_score = []
    mv_people = []
    mv_quot = []
    first_split = str(source.split('<ol class="grid_view">').pop(1))
    second_split = str(first_split.split('</ol>').pop(0))
    third_split = second_split.split('<div class="info">')
    del third_split[0]
    for line in third_split:
        line = line.split('</li>')
        del line[1]
        # 名称抽取
        raw_single_mv_name = line[0].split('</a>')[0].split('<span class="title">')[1]
        single_mv_name = raw_single_mv_name.split('</span>')[0]
        mv_name.append(single_mv_name)
        # 分数与评价人数抽取
        raw_single_mv_evaluate = line[0].split('<div class="star">')[1].split('</span>')
        single_mv_score = raw_single_mv_evaluate[1].split('">')[1]
        mv_score.append(single_mv_score)
        single_mv_people = raw_single_mv_evaluate[3].split('<span>')[1]
        mv_people.append(single_mv_people)
        # 引用语抽取
        # log(mv_name, mv_score, mv_people, line[0])
       # log(line[0].split('<span class="inq">')[1])
        raw_singe_mv_quot = line[0].split('<span class="inq">')[1]
       # log(raw_singe_mv_quot)
        single_mv_quot = raw_singe_mv_quot.split('</span>')[0]
        #log(single_mv_quot)
        mv_quot.append(single_mv_quot)
        # 此处mv_quot有值
        log(mv_quot)
    #为何这里mv_quot提示list index out of range
    log(mv_quot)
    # log(len(mv_name), len(mv_score), len(mv_people), len(mv_quot))
    return mv_name, mv_score, mv_people, mv_quot
def main():
    url = "https://movie.douban.com/top250"
    protocol, host, port, path = parse_url(url)
    log(protocol, host, port, path)
    queries = {}
    for v in [value for value in range(250, 0, -25)]:
        queries[v] = 'start'
    log(queries)
    i = 0
    for q in queries.items():
        try:
            status_code, header, body = get(url, q)
            """
            if i == 8:
                log(status_code, header, body)
            """
            mvo_name, mvo_score, mvo_people, mvo_quot = parse_page(source=body)
           # log(mvo_name)
           # log(mvo_score)
           # log(mvo_people)
            log(mvo_quot)
            i += 1
        except Exception as e:
            log(e)
            continue
if __name__ == '__main__':    main()

回答：

这行代码有问题

raw_singe_mv_quot = line[0].split('<span class="inq">')[1]

拆开解释

tmp_list = line[0].split('<span class="inq">')raw_singe_mv_quot = tmp_list[1]

tmp_list 这个列表的长度可能为 1，所以 tmp_list[1] 会报错误。

具体逻辑我也没看，你自己排查吧！

回答：

log: list index out of range是被main函数这一行打印出来的

python3">        except Exception as e:
            log(e)            # note！            continue

把你写的try...catch去掉，解释器给你的信息已足够

Traceback (most recent call last):
  File "test_dou.py", line 173, in <module>
    main()
  File "test_dou.py", line 162, in main
    mvo_name, mvo_score, mvo_people, mvo_quot = parse_page(source=body)
  File "test_dou.py", line 134, in parse_page
    raw_singe_mv_quot = line[0].split('<span class="inq">')[1]IndexError: list index out of range

回答：

这里有个例子

import requests
import random
from lxml import etree
from bs4 import BeautifulSoup
url = "https://movie.douban.com/top250?start={}&filter="
with open('douban.txt', 'w', encoding='utf-8') as f:
    for num in range(0, 5, 25):
        url_next = url.format(num)
        pro = ['122.152.196.126', '114.215.174.227', '119.185.30.75']
        head = {
            'user-Agent':
            'Mozilla/5.0(Windows NT 10.0;Win64 x64)AppleWebkit/537.36(KHTML,like Gecko) chrome/58.0.3029.110 Safari/537.36'
        }
        response = requests.get(url_next,
                                proxies={'http': random.choice(pro)},
                                headers=head,
                                timeout=3)
        soup = BeautifulSoup(response.text, 'lxml')
        names = soup.select('div.hd > a')
        levels = soup.select('span.rating_num')
        for name, level in zip(names, levels):
            name = name.get_text().split('/')[0].split('\n')[1]
            level = level.get_text()
            f.write(name + level + '\n')
        print('保存成功')

以上是 Python中for循环中列表切片问题的全部内容，来源链接： utcz.com/a/158446.html

Python中for循环中列表切片问题

回答：

回答：

回答：

其他人也看了：