Python Web Scrape Cycle选项卡

我想寻求帮助:如何循环访问网站上的所有选项卡,以抓取所有相关信息。

在以下站点中,有几个选项卡,分别标记为5x5,5x10x5,10x10等。我不确定该如何组织脚本,才能在循环中依次遍历这些选项卡。感谢您的帮助。

下面是python脚本;

from urllib.request import urlopen as uReq 

from bs4 import BeautifulSoup as soup

import csv

# Question's original script: fetch each listing page, pull the unit and
# store-address fields out of the HTML and write them to a CSV file.
# NOTE(review): this listing has no indentation, so the nesting of the loops
# and of the final writerow() cannot be read from the text as shown.
# Pages to scrape; the `size` query parameter presumably selects one size tab.
urls = [

'https://www.lifestorage.com/storage-units/florida/orlando/32810/610-near-lockhart/?size=5x5'

]

filename = 'life_storage.csv'

# 'a+' appends, so rows from repeated runs accumulate in the same file.
f = open(filename, 'a+')

csv_writer = csv.writer(f)

headers = ['unit_size', 'unit_type', 'description', 'online_price', 'reg_price', 'store_address', 'store_city', 'store_state', 'store_postalcode' ]

##unit_size = 5'x10' without the '

##unit_type = climate controlled or not (this could be blank if non-climate)

##description = the level it's on and type of access.

##online_price = $##/mo text

##reg_price = the scratched off $## text

# The header row is written unconditionally, i.e. again on every run.
csv_writer.writerow(headers)

for my_url in urls:

uClient = uReq(my_url)

page_html = uClient.read()

uClient.close()

page_soup = soup(page_html, 'html.parser')

store_locator = page_soup.findAll("div", {"itemprop": "address"})

# Selects the <ul id="spaceList"> element(s) themselves, not their items.
containers = page_soup.findAll("ul", {"id": "spaceList"})

for container in containers:

for store_location in store_locator:

store_address1 = store_location.find("span", {"itemprop": "streetAddress"})

store_address = store_address1.text

store_city1 = store_location.find("span", {"itemprop": "addressLocality"})

store_city = store_city1.text

store_state1 = store_location.find("span", {"itemprop": "addressRegion"})

store_state = store_state1.text

store_postalcode1 = store_location.find("span", {"itemprop": "postalCode"})

store_postalcode = store_postalcode1.text

# find() returns only the first matching `storesRow` inside the container.
title_container = container.find("div", {"class": "storesRow"})

# .text of the whole <div>; in the sample HTML that includes the features list.
unit_size = title_container.text

# Same `storesRow` lookup as title_container above.
unit_container = container.find("div", {"class": "storesRow"})

unit_type = unit_container.span.text

description_container = container.find("ul", {"class": "features"})

description = description_container.text

online_price_container = container.find("div", {"class": "priceBox"})

# In the sample HTML the <span> also wraps the <i> holding the regular price.
online_price = online_price_container.span.text

# Same `priceBox` lookup as online_price_container above.
reg_price_container = container.find("div", {"class": "priceBox"})

reg_price = reg_price_container.i.text

csv_writer.writerow([unit_size, unit_type, description, online_price, reg_price, store_address, store_city, store_state, store_postalcode])

f.close()

下面是与循环相关的html正文的片段;

//////////\\\\\\\Description BOX  

<div class="storesRow">

<span>

<a href="/reservation/choose/?store=610&amp;type=1"> 5' x 5'<sup>*</sup> - Climate Controlled </a>

</span>

<ul class="features">

<li>Indoor access</li>

<li>Ground Level</li>

</ul>

</div>

//////////\\\\\\\\\PRICE BOX

<div class="priceBox">

<span>

$25/mo

<i> $27</i>

</span>

<em class="pOnly ">Phone &amp; online only</em>

<div class="specialsMessage">

</div>

</div>

//////////\\\\\\\\\ADDRESS BOX

<div itemprop="address" itemscope="" itemtype="https://schema.org/PostalAddress">

<em>

<i class="fa fa-map-marker"></i>

<span itemprop="streetAddress">7244 Overland Rd </span>

<span itemprop="addressLocality">Orlando</span>,

<span itemprop="addressRegion">FL</span>

<span itemprop="postalCode">32810</span>

</em>

</div>

当前输出

所需的输出

回答:

你的缩进有误——writerow() 应该放在内层 for 循环里面。

不过,要从各个元素中提取出正确的文本,可能还需要做更多处理。请参阅下面的代码。

from urllib.request import urlopen as uReq 

from bs4 import BeautifulSoup as soup

import csv

# Pages to scrape, one URL per listing page.
urls = [
    'https://www.lifestorage.com/storage-units/florida/orlando/32810/610-near-lockhart/?size=5x5',
]

filename = 'life_storage.csv'

headers = [
    'unit_size', 'unit_type', 'description', 'online_price', 'reg_price',
    'store_address', 'store_city', 'store_state', 'store_postalcode',
]

# Column notes:
#   unit_size    = 5' x 10', with the footnote asterisk removed
#   unit_type    = climate controlled or not (blank if non-climate)
#   description  = the level it's on and type of access
#   online_price = $##/mo text
#   reg_price    = the scratched off $## text


def split_unit_title(text):
    """Split a listing title into ``(unit_size, unit_type)``.

    The title looks like ``5' x 5'* - Climate Controlled``. Everything before
    the first ``-`` is the size (any ``*`` is dropped); the piece after it is
    the type. A title without ``-`` gives an empty ``unit_type``.
    """
    parts = text.strip().split('-')
    # str.split always returns at least one element, so parts[0] is safe.
    unit_size = parts[0].strip().replace('*', "")
    unit_type = parts[1].strip() if len(parts) > 1 else ''
    return unit_size, unit_type


def _address_part(store_location, itemprop):
    """Return the stripped text of the address <span> with this ``itemprop``."""
    item = store_location.find("span", {"itemprop": itemprop})
    return item.text.strip()


def parse_rows(page_soup):
    """Yield one CSV row (a list matching ``headers``) per unit on the page."""
    store_location = page_soup.find("div", {"itemprop": "address"})

    # need `li`: the units are the <li> elements inside the list,
    # not the <ul> itself.
    containers = page_soup.find("ul", {"id": "spaceList"}).findAll('li')
    print('len(containers):', len(containers))

    # The store address is looked up once and reused for every unit row.
    store_address = _address_part(store_location, "streetAddress")
    store_city = _address_part(store_location, "addressLocality")
    store_state = _address_part(store_location, "addressRegion")
    store_postalcode = _address_part(store_location, "postalCode")

    for container in containers:
        item = container.find("div", {"class": "storesRow"})
        if not (item and item.span):
            # No title in this <li>, so it is skipped.
            continue
        unit_size, unit_type = split_unit_title(item.span.text)

        item = container.find("ul", {"class": "features"})
        description = item.text.strip().replace("\n", ',') if item else ''

        item = container.find("div", {"class": "priceBox"})
        # Read the regular price first: the <i> is removed from the tree below.
        reg_price = item.i.text.strip() if item and item.i else ''

        if item and item.span:
            if item.i:
                # Remove <i> so the span text holds only the online price.
                item.i.extract()
            online_price = item.span.text.strip()
        else:
            online_price = ''

        yield [unit_size, unit_type, description, online_price, reg_price,
               store_address, store_city, store_state, store_postalcode]


def main():
    """Scrape every URL in ``urls`` and append the rows to ``filename``."""
    # newline='' is what the csv module expects for its file objects; the
    # `with` block closes the file even if a request or a lookup fails.
    with open(filename, 'a+', newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(headers)

        for my_url in urls:
            with uReq(my_url) as uClient:
                page_html = uClient.read()

            page_soup = soup(page_html, 'html.parser')

            # writerow() runs once per unit, inside the loop over the units.
            for row in parse_rows(page_soup):
                csv_writer.writerow(row)


if __name__ == '__main__':
    main()

结果:

unit_size,unit_type,description,online_price,reg_price,store_address,store_city,store_state,store_postalcode 

5' x 5',Climate Controlled,"Indoor access,Ground Level",$25/mo,$27,7244 Overland Rd,Orlando,FL,32810

5' x 5',,"Outdoor/Drive-up access,Ground Level",Check for Availability,,7244 Overland Rd,Orlando,FL,32810

5' x 10',,"Outdoor/Drive-up access,Ground Level",$46/mo,$50,7244 Overland Rd,Orlando,FL,32810

10' x 5',Climate Controlled,"Indoor access,Ground Level",$57/mo,$62,7244 Overland Rd,Orlando,FL,32810

5' x 10',Climate Controlled,"Indoor access,Ground Level",$67/mo,$73,7244 Overland Rd,Orlando,FL,32810

5' x 10',,"Outdoor/Drive-up access,Ground Level",Check for Availability,,7244 Overland Rd,Orlando,FL,32810

5' x 15',Climate Controlled,"Indoor access,Ground Level",$69/mo,$75,7244 Overland Rd,Orlando,FL,32810

10' x 10',,"Outdoor/Drive-up access,Ground Level",$105/mo,$115,7244 Overland Rd,Orlando,FL,32810

10' x 10',Climate Controlled,"Indoor access,Ground Level",$105/mo,$115,7244 Overland Rd,Orlando,FL,32810

10' x 10',Climate Controlled,"Indoor access,Ground Level",$124/mo,$136,7244 Overland Rd,Orlando,FL,32810

10' x 15',,"Outdoor/Drive-up access,Ground Level",$144/mo,$158,7244 Overland Rd,Orlando,FL,32810

10' x 16',,"Outdoor/Drive-up access,Ground Level",$145/mo,$159,7244 Overland Rd,Orlando,FL,32810

10' x 15',Climate Controlled,"Indoor access,Ground Level",$149/mo,$163,7244 Overland Rd,Orlando,FL,32810

10' x 18',,"Outdoor/Drive-up access,Ground Level",$149/mo,$163,7244 Overland Rd,Orlando,FL,32810

10' x 15',Climate Controlled,"Indoor access,Ground Level",Check for Availability,,7244 Overland Rd,Orlando,FL,32810

10' x 20',,"Outdoor/Drive-up access,Ground Level",$147/mo,$161,7244 Overland Rd,Orlando,FL,32810

10' x 25',Climate Controlled,"Indoor access,Ground Level",$175/mo,$192,7244 Overland Rd,Orlando,FL,32810

10' x 20',Climate Controlled,"Indoor access,Ground Level",Check for Availability,,7244 Overland Rd,Orlando,FL,32810

10' x 28',,"Outdoor/Drive-up access,Ground Level",Check for Availability,,7244 Overland Rd,Orlando,FL,32810

41' x 41',,"Outdoor/Drive-up access,Ground Level",$1400/mo,$1540,7244 Overland Rd,Orlando,FL,32810

22' x 25',,"Outdoor/Drive-up access,Ground Level",Check for Availability,,7244 Overland Rd,Orlando,FL,32810

18' x 38',,"Outdoor/Drive-up access,Ground Level",Check for Availability,,7244 Overland Rd,Orlando,FL,32810

以上是 Python Web Scrape Cycle选项卡 的全部内容, 来源链接: utcz.com/qa/266827.html

回到顶部