Python 爬取今日头条图片

python

import requests

from urllib.parse import urlencode

from requests import codes

import os

# qianxiao996精心制作

#博客地址:https://blog.csdn.net/qq_36374896

from hashlib import md5

from multiprocessing.pool import Pool

def get_page(offset, timeout=10):
    """Fetch one page of Toutiao search results for the keyword '街拍'.

    Args:
        offset: Value sent as the ``offset`` query parameter (start index of
            the requested page).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the worker forever.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, any request failure including
        timeouts, or a body that is not valid JSON).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
    }
    base_url = 'https://www.toutiao.com/search_content/?'
    url = base_url + urlencode(params)
    try:
        resp = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Base class of ConnectionError and Timeout, so the timeouts made
        # possible by ``timeout`` are handled the same way as lost connections.
        return None
    if resp.status_code != codes.ok:
        return None
    try:
        return resp.json()
    except ValueError:
        # Body was not valid JSON; treat it like any other failed fetch.
        return None

def get_images(json):
    """Yield one download descriptor per image listed in a search response.

    Args:
        json: Decoded search response (a dict that may hold a ``data`` list),
            or ``None`` when the page could not be fetched. The parameter
            name is kept for backward compatibility.

    Yields:
        Dicts with two keys: ``image``, the large-image URL built from the
        last 23 characters of the listed URL, and ``title``, the item's
        title (``None`` if the item has none).
    """
    if not json:
        # Nothing was fetched (None) or the response is empty.
        return
    for item in json.get('data') or []:
        # Entries that carry a ``cell_type`` are skipped.
        if item.get('cell_type') is not None:
            continue
        title = item.get('title')
        # ``image_list`` may be absent; treat that as "no images".
        for image in item.get('image_list') or []:
            url_temp = image.get('url')
            if not url_temp:
                continue
            yield {
                # The last 23 characters of the list URL are reused as the
                # address of the large image, so the large version is fetched.
                'image': 'https:' + '//p3.pstatp.com/large/pgc-image/' + url_temp[-23:],
                'title': title,
            }

def _safe_dir_name(title):
    """Turn a scraped title into a single, safe directory name."""
    text = '' if title is None else str(title)
    # Replace path separators, characters Windows forbids in file names and
    # control characters, so the title cannot create nested or invalid paths.
    cleaned = ''.join(
        '_' if ch in '\\/:*?"<>|' or ord(ch) < 32 else ch
        for ch in text
    )
    # Leading/trailing dots and spaces would allow names such as '..'.
    cleaned = cleaned.strip(' .')
    return cleaned or 'untitled'


def save_image(item, timeout=10):
    """Download one image into ``img/<title>/<md5 of content>.jpg``.

    Args:
        item: Dict with an ``image`` URL and a ``title``, as yielded by
            ``get_images``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    img_path = os.path.join('img', _safe_dir_name(item.get('title')))
    # exist_ok avoids the check-then-create race when several pool workers
    # handle items that share a title.
    os.makedirs(img_path, exist_ok=True)
    try:
        resp = requests.get(item.get('image'), timeout=timeout)
    except requests.RequestException:
        # Base class of ConnectionError and Timeout.
        print('Failed to Save Image,item %s' % item)
        return
    if resp.status_code != codes.ok:
        print('Failed to Save Image,item %s' % item)
        return
    # Naming the file after the content hash makes repeated downloads of the
    # same picture collapse into one file.
    file_path = os.path.join(
        img_path,
        '{file_name}.{file_suffix}'.format(
            file_name=md5(resp.content).hexdigest(),
            file_suffix='jpg',
        ),
    )
    if os.path.exists(file_path):
        print('Already Downloaded', file_path)
        return
    with open(file_path, 'wb') as f:
        f.write(resp.content)
    print('Downloaded image path is %s' % file_path)

def main(offset):
    """Fetch one search-result page and download every image it lists.

    Args:
        offset: Search-result offset passed on to ``get_page``.
    """
    page = get_page(offset)
    if page is None:
        # get_page returns None when the fetch failed; there is nothing to
        # iterate, and passing None on would raise AttributeError.
        return
    for item in get_images(page):
        print(item)
        save_image(item)

# First and last page index (inclusive) to crawl.
GROUP_START = 0
GROUP_END = 3

if __name__ == '__main__':
    # One offset per page, in steps of 20: 0, 20, 40, ...
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # map() blocks until every page is processed; the context manager then
    # tears the pool down, including when map() raises.
    with Pool() as pool:
        pool.map(main, groups)

代码年代久远，当时没写注释，现在也懒得补了。

以上是 python爬取今日头条图片 的全部内容, 来源链接: utcz.com/z/388437.html

回到顶部