Python程序中变量作用范围应该如何理解?

Python程序中变量作用范围应该如何理解?

初学Python，自己尝试着写了一个爬虫，主要代码如下：

import json

import scrapy

from bs4 import BeautifulSoup

from docx import Document

from docx.shared import Pt

class BidsSpider(scrapy.Spider):
    """Collect bid announcements from ggzy.gov.cn into a single Word document.

    Crawl flow:

    1. ``start_requests`` fetches page ``page`` of the JSON search endpoint
       stored in ``url``.
    2. ``parse`` reads the ``data`` list of that page, schedules one detail
       request per entry and then the next list page. An empty ``data`` list
       ends the pagination.
    3. ``parse_detail`` extracts the link to the full announcement text.
    4. ``parse_text`` appends the announcement to ``doc``.
    5. ``closed`` writes ``doc`` to ``output_path`` once the crawl is over.

    The document is saved in ``closed`` rather than in ``parse``: Scrapy
    schedules requests asynchronously, so the empty list page can be
    processed before the ``parse_text`` callbacks have added any content,
    which would produce an empty file.
    """

    name = 'bids_spider'

    allowed_domains = [
        'deal.ggzy.gov.cn',
        'www.ggzy.gov.cn',
    ]

    # Kept for compatibility; the crawl is actually seeded by start_requests().
    start_urls = [
        'http://deal.ggzy.gov.cn',
        'http://www.ggzy.gov.cn',
    ]

    custom_settings = {
        "DOWNLOAD_DELAY": 1,
        "RETRY_ENABLED": True,
    }

    # Number of the list page requested next.
    page = 1

    # Search endpoint; the page number is appended to the trailing PAGENUMBER=.
    url = 'http://deal.ggzy.gov.cn/ds/deal/dealList_find.jsp?TIMEBEGIN_SHOW=2020-09-01&TIMEEND_SHOW=2020-10-20&TIMEBEGIN=2020-09-01&TIMEEND=2020-10-20&SOURCE_TYPE=1&DEAL_TIME=06&DEAL_CLASSIFY=01&DEAL_STAGE=0101&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0&isShowAll=1&FINDTXT=风电&PAGENUMBER='

    # Word document that accumulates every scraped announcement.
    doc = Document()

    # Destination of the generated document.
    output_path = 'D:/projects/test.docx'

    # Keys copied from each list entry into the item passed along the chain.
    _ITEM_FIELDS = (
        'title',
        'platformName',
        'districtShow',
        'tradeShow',
        'timeShow',
    )

    def start_requests(self):
        """Yield the request for the first list page."""
        yield scrapy.Request(
            url=self.url + str(self.page),
            callback=self.parse,
            method='GET',
        )

    def parse(self, response):
        """Handle one JSON list page.

        Yields a detail request for every entry in ``data`` followed by the
        request for the next list page. Yields nothing when ``data`` is
        empty, which stops the pagination.
        """
        json_data = json.loads(response.text)
        bid_list = json_data['data']
        if not bid_list:
            # Last page reached. Saving happens in closed(), after every
            # pending detail request has been processed.
            return

        for value in bid_list:
            item = {field: value[field] for field in self._ITEM_FIELDS}
            yield scrapy.Request(
                url=value['url'],
                callback=self.parse_detail,
                meta={'item': item},
                method='GET',
            )

        self.page += 1
        yield scrapy.Request(
            url=self.url + str(self.page),
            callback=self.parse,
            method='GET',
        )

    def parse_detail(self, response):
        """Follow the link from an announcement page to its full text."""
        item = response.meta['item']
        onclick = response.css('li.li_hover a::attr(onclick)').extract_first()
        if not onclick:
            # Without the attribute there is nothing to slice; skip the entry
            # instead of failing with a TypeError on None.
            self.logger.warning('No detail link found on %s', response.url)
            return

        # NOTE(review): the slice presumably strips a fixed-length JavaScript
        # call prefix and the closing quote/parenthesis from the onclick
        # value - confirm against the page markup.
        yield scrapy.Request(
            url='http://www.ggzy.gov.cn/information' + onclick[25: -2],
            callback=self.parse_text,
            meta={'item': item},
            method='GET',
        )

    def parse_text(self, response):
        """Append one announcement (metadata and body text) to ``doc``."""
        item = response.meta['item']
        html = response.xpath('//div[@class="detail_content"]').extract_first()
        # A missing container yields empty text rather than the string "None".
        soup = BeautifulSoup(html or '', 'html.parser')
        item['detail'] = soup.get_text()

        title_run = self.doc.add_paragraph().add_run(item['title'])
        title_run.font.name = '黑体'
        title_run.bold = True

        self._add_labeled_paragraph('来源平台：', item['platformName'])
        self._add_labeled_paragraph('省份：', item['districtShow'])
        self._add_labeled_paragraph('行业：', item['tradeShow'])
        self._add_labeled_paragraph('发布日期：', item['timeShow'])

        detail_run = self.doc.add_paragraph().add_run(item['detail'])
        detail_run.font.size = Pt(8)

        # One announcement per page.
        self.doc.add_page_break()

        self.logger.info('#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#')
        self.logger.info(item['title'])
        self.logger.info('#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#')

    def _add_labeled_paragraph(self, label, value):
        """Add a paragraph made of a bold ``label`` followed by ``value``."""
        paragraph = self.doc.add_paragraph()
        paragraph.add_run(label).bold = True
        paragraph.add_run(value)

    def closed(self, reason):
        """Save the document when the spider closes.

        Scrapy calls this hook once the spider has finished, so every
        ``parse_text`` callback that ran has already written to ``doc``.
        """
        self.doc.save(self.output_path)

出现的问题是：self.doc.save 这一行生成的文件是空的，但最后的日志输出语句是有内容的，说明的确爬到了数据。我觉得是我对Python的变量作用范围理解不到位，导致代码有问题，但是尝试了各种方法，还是没法解决。希望哪位大佬可以指导我一下，万分感谢。


回答:

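数据的存储去Item Pipeline里处理，不要在这里。这个问题与变量作用范围无关：Scrapy 的请求是异步调度的，parse 翻到空的列表页并执行 self.doc.save 时，详情页的回调（parse_text）往往还没有执行，所以保存下来的文档是空的。应当等爬虫结束后再保存，例如在 Item Pipeline 的 close_spider() 或 spider 的 closed() 方法中调用 save。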

以上是 Python程序中变量作用范围应该如何理解? 的全部内容, 来源链接: utcz.com/a/62289.html

回到顶部