爬虫ConnectionError MaxRetryError怎么解决?

试了两种代码,找了无数解决方案。。。
也解决不了
错误如下
爬虫ConnectionError MaxRetryError怎么解决?
爬虫ConnectionError MaxRetryError怎么解决?
我的代码

import pandas as pd

import random

from time import sleep

import requests

save_path='D:/年报'

download_path='http://static.cninfo.com.cn/'

User_Agent=['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 Edg/100.0.1185.29']

headers={

'Host':'www.cninfo.com.cn',

'Origin': 'http://www.cninfo.com.cn',

'Accept-Encoding': 'gzip, deflate',

'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',

'Accept': 'application/json, text/javascript,*/*;q=0.01',

'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',

'origin':'www.cninfo.com.cn',

'referer':'www.cninfo.com.cn/new/commonurl?url=disloure/list/notice',

'X-Requested-With': 'XMLHttpRequest',

}

def get_orgid(namelist):

orglist=[]

url = "http://www.cninfo.com./new/information/topSearch/detailOfQuery"

hd={

'Host':'www.cninfo.com.cn',

'Origin': 'http://www.cninfo.com.cn',

'Pragma':'no-cache',

'Accept-Encoding': 'gzip,deflate',

'Connection': 'close',

'Content -Length': '70',

'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 Edg/100.0.1185.29',

'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',

'Accept': 'application/json, text/javascript,*/*',

'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',

}

for name in namelist:

data = {

'keyWord':name,

'maxSecNum': 10,

'maxListNum': 5,

}

r = requests.post(url,headers=hd,data=data)

org_id=r.json()["keyBoardList"][0]["orgId"]

orglist.append(org_id)

formatlist=list(set(orglist))

formatlist.sort(key=orglist.index)

return formatlist

def single_page(stock):

query_path = "http://www.cninfo.com.cn/new/hisAnnouncement/query"

headers['User-Agent']=random.choice(User_Agent)

print(stock)

query = {

'stock' : stock,

'tabName ' : 'fulltext',

'pageSize': 30,

'pageNum': 1,

'column': 'szse',

'category': 'category_ndbg_szsh; ',

'plate': '',

'seDate': '',

'trade':'',

'searchkey ': '',

'secid': '',

'sortName': '',

'sortType':'',

'isHLtitle': 'true',

}

namelist=request.post(query_path,headers=headers,data=query)

single_page=namelist.json()['announcements']

print(len(single_page))

return single_page

def saving(single_page):

headers={

'Host':'www.cninfo.com.cn',

'Connection': 'close',

'upgrade-insecure-requests':'1',

'Accept-Encoding': 'gzip, deflate',

'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',

'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',

'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q-0.7,en-US;q=0.6',

'referer':'www.cninfo.com.cn/new/commonurl?url=disloure/list/notice',

'Cookie':'routeID=ucl',

}

for i in single_page:

if "摘要"in i['announcementTitle'] or "公告" in i['announcementTitle']:

continue

elif "年年度报告"in i['announcementTitle']:

download = download_path + i["adjunctUr1"]

file_path = saving_path +'/'+name

print(file_path)

time.sleep(random.random()* 2)

headers[ 'User-Agent']= random.choice(User_Agent)

r = requests.get(download,headers=headers)

time.sleep(10)

print(r.status_code)

f= open(file_path,"wb")

f.write(r.content)

f.close()

if __name__== '__main__':

Sec = pd.read_excel('C:/Users/dell/Desktop/01.xlsx',dtype={'code':'object'})

Seclist = list(Sec['code'])

Namelist = list(Sec['name'])

org_list= get_orgid(Namelist)

Sec['orgid'] = org_list

Sec.to_excel('C:/Users/dell/Desktop/01.xlsx',sheet_name='sheet-2',index=False)

stock = ''

count = 0

for rows in Sec.iterrows():

stock = str(rows[1]['code'])+','+str(rows[1]['orgid'])+':'

try:

page_data = single_page(stock)

except:

print('page error')

saving(page_data)

count = count+1

print('计数',count)

实在不懂,求帮助!!!

补充

gaierror                                  Traceback (most recent call last)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connection.py:174, in HTTPConnection._new_conn(self)

173 try:

--> 174 conn = connection.create_connection(

175 (self._dns_host, self.port), self.timeout, **extra_kw

176 )

178 except SocketTimeout:

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\util\connection.py:72, in create_connection(address, timeout, source_address, socket_options)

68 return six.raise_from(

69 LocationParseError(u"'%s', label empty or too long" % host), None

70 )

---> 72 for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):

73 af, socktype, proto, canonname, sa = res

File ~\AppData\Local\Programs\Python\Python310\lib\socket.py:955, in getaddrinfo(host, port, family, type, proto, flags)

954 addrlist = []

--> 955 for res in _socket.getaddrinfo(host, port, family, type, proto, flags):

956 af, socktype, proto, canonname, sa = res

gaierror: [Errno 11001] getaddrinfo failed

During handling of the above exception, another exception occurred:

NewConnectionError Traceback (most recent call last)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connectionpool.py:703, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)

702 # Make the request on the httplib connection object.

--> 703 httplib_response = self._make_request(

704 conn,

705 method,

706 url,

707 timeout=timeout_obj,

708 body=body,

709 headers=headers,

710 chunked=chunked,

711 )

713 # If we're going to release the connection in ``finally:``, then

714 # the response doesn't need to know about the connection. Otherwise

715 # it will also try to release it and we'll have a double-release

716 # mess.

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connectionpool.py:398, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)

397 else:

--> 398 conn.request(method, url, **httplib_request_kw)

400 # We are swallowing BrokenPipeError (errno.EPIPE) since the server is

401 # legitimately able to close the connection after sending a valid response.

402 # With this behaviour, the received response is still readable.

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connection.py:239, in HTTPConnection.request(self, method, url, body, headers)

238 headers["User-Agent"] = _get_default_user_agent()

--> 239 super(HTTPConnection, self).request(method, url, body=body, headers=headers)

File ~\AppData\Local\Programs\Python\Python310\lib\http\client.py:1282, in HTTPConnection.request(self, method, url, body, headers, encode_chunked)

1281 """Send a complete request to the server."""

-> 1282 self._send_request(method, url, body, headers, encode_chunked)

File ~\AppData\Local\Programs\Python\Python310\lib\http\client.py:1328, in HTTPConnection._send_request(self, method, url, body, headers, encode_chunked)

1327 body = _encode(body, 'body')

-> 1328 self.endheaders(body, encode_chunked=encode_chunked)

File ~\AppData\Local\Programs\Python\Python310\lib\http\client.py:1277, in HTTPConnection.endheaders(self, message_body, encode_chunked)

1276 raise CannotSendHeader()

-> 1277 self._send_output(message_body, encode_chunked=encode_chunked)

File ~\AppData\Local\Programs\Python\Python310\lib\http\client.py:1037, in HTTPConnection._send_output(self, message_body, encode_chunked)

1036 del self._buffer[:]

-> 1037 self.send(msg)

1039 if message_body is not None:

1040

1041 # create a consistent interface to message_body

File ~\AppData\Local\Programs\Python\Python310\lib\http\client.py:975, in HTTPConnection.send(self, data)

974 if self.auto_open:

--> 975 self.connect()

976 else:

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connection.py:205, in HTTPConnection.connect(self)

204 def connect(self):

--> 205 conn = self._new_conn()

206 self._prepare_conn(conn)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connection.py:186, in HTTPConnection._new_conn(self)

185 except SocketError as e:

--> 186 raise NewConnectionError(

187 self, "Failed to establish a new connection: %s" % e

188 )

190 return conn

NewConnectionError: <urllib3.connection.HTTPConnection object at 0x0000025D9346B1F0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed

During handling of the above exception, another exception occurred:

MaxRetryError Traceback (most recent call last)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\adapters.py:440, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)

439 if not chunked:

--> 440 resp = conn.urlopen(

441 method=request.method,

442 url=url,

443 body=request.body,

444 headers=request.headers,

445 redirect=False,

446 assert_same_host=False,

447 preload_content=False,

448 decode_content=False,

449 retries=self.max_retries,

450 timeout=timeout

451 )

453 # Send the request.

454 else:

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connectionpool.py:785, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)

783 e = ProtocolError("Connection aborted.", e)

--> 785 retries = retries.increment(

786 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]

787 )

788 retries.sleep()

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\util\retry.py:592, in Retry.increment(self, method, url, response, error, _pool, _stacktrace)

591 if new_retry.is_exhausted():

--> 592 raise MaxRetryError(_pool, url, error or ResponseError(cause))

594 log.debug("Incremented Retry for (url='%s'): %r", url, new_retry)

MaxRetryError: HTTPConnectionPool(host='www.cninfo.com.', port=80): Max retries exceeded with url: /new/information/topSearch/detailOfQuery (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000025D9346B1F0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

During handling of the above exception, another exception occurred:

ConnectionError Traceback (most recent call last)

Input In [2], in <cell line: 103>()

105 Seclist = list(Sec['code'])

106 Namelist = list(Sec['name'])

--> 107 org_list= get_orgid(Namelist)

108 Sec['orgid'] = org_list

110 Sec.to_excel('C:/Users/dell/Desktop/01.xlsx',sheet_name='sheet-2',index=False)

Input In [2], in get_orgid(namelist)

37 for name in namelist:

38 data = {

39 'keyWord':name,

40 'maxSecNum': 10,

41 'maxListNum': 5,

42 }

---> 43 r = requests.post(url,headers=hd,data=data)

44 org_id=r.json()["keyBoardList"][0]["orgId"]

45 orglist.append(org_id)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\api.py:117, in post(url, data, json, **kwargs)

105 def post(url, data=None, json=None, **kwargs):

106 r"""Sends a POST request.

107

108 :param url: URL for the new :class:`Request` object.

(...)

114 :rtype: requests.Response

115 """

--> 117 return request('post', url, data=data, json=json, **kwargs)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\api.py:61, in request(method, url, **kwargs)

57 # By using the 'with' statement we are sure the session is closed, thus we

58 # avoid leaving sockets open which can trigger a ResourceWarning in some

59 # cases, and look like a memory leak in others.

60 with sessions.Session() as session:

---> 61 return session.request(method=method, url=url, **kwargs)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\sessions.py:529, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)

524 send_kwargs = {

525 'timeout': timeout,

526 'allow_redirects': allow_redirects,

527 }

528 send_kwargs.update(settings)

--> 529 resp = self.send(prep, **send_kwargs)

531 return resp

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\sessions.py:645, in Session.send(self, request, **kwargs)

642 start = preferred_clock()

644 # Send the request

--> 645 r = adapter.send(request, **kwargs)

647 # Total elapsed time of the request (approximately)

648 elapsed = preferred_clock() - start

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\adapters.py:519, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)

515 if isinstance(e.reason, _SSLError):

516 # This branch is for urllib3 v1.22 and later.

517 raise SSLError(e, request=request)

--> 519 raise ConnectionError(e, request=request)

521 except ClosedPoolError as e:

522 raise ConnectionError(e, request=request)

ConnectionError: HTTPConnectionPool(host='www.cninfo.com.', port=80): Max retries exceeded with url: /new/information/topSearch/detailOfQuery (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000025D9346B1F0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))


回答:

首先,你截图的报错信息没有截到最关键的地方(哪一行出错),一般来说拍错首先要定位到错误出现在哪里,才能准确判断错误。

ConnectionError MaxRetryError 通常出现在网络请求失败的时候,无非是网络不可达或者目标网站有反爬机制。

网络不可达

  • 检查目标网站能否访问
  • 检查 URL 能否访问

反爬机制

  • 通过变换 UA 尝试能否访问
  • 判断是否存在其他反爬机制

调试爬虫之前要确保你的 HTTP 请求在浏览器中能接收到响应。

以上是 爬虫ConnectionError MaxRetryError怎么解决? 的全部内容, 来源链接: utcz.com/p/938406.html

回到顶部