Unable to scrape the URLs of individual products from a website?

The relevant code is shown below (the first snippet comes from scraper and is used by the second snippet).

# function to get project url
def extract_project_url(df_input):
    list_url = []
    # prepend the site root to each relative clickthrough path
    for ele in df_input["clickthrough_url"]:
        list_url.append("https://www.indiegogo.com" + ele)
    return list_url

import sys
# append the scraper's src directory (not main.py itself) so its modules can be imported
sys.path.append("C:/Users/1/Desktop/indiegogo-scrapper-main/src")
import os
import json
# load data analysis library
import pandas as pd
# load modules in scraper
from scraper import *
import multiprocessing


def main():
    args = sys.argv[1:]
    if os.path.exists("chromedriver\\chromedriver.exe") is False:
        print("put chromedriver.exe into chromedriver directory.")
    else:
        if os.path.exists("data\\1.csv") is False:
            print("put 1.csv into data directory.")
        else:
            if len(args) < 1:
                print("define the json filename.")
            elif args[0].find(".json") != -1:
                dir_path_data = "data"
                dir_path_output = "out/" + args[0]
                # collect the filenames in the data directory
                filenames = next(os.walk(dir_path_data), (None, None, []))[2]
                list_project_site = []
                for ele in filenames:
                    df_indiegogo = pd.read_csv(dir_path_data + "\\" + ele, encoding="gbk",
                                               encoding_errors="ignore", sep='\t', on_bad_lines='skip')
                    list_project_site.extend(extract_project_url(df_indiegogo))
                list_project_site = [[i, e] for i, e in enumerate(list_project_site)]
                # resume from an existing output file if there is one
                try:
                    f = open(dir_path_output, "r")
                    data = json.loads(f.read())
                    f.close()
                except Exception:
                    data = {}
                # drop the sites that were already scraped
                list_processed = [e for e in list_project_site
                                  if e[1] not in [data[key]["site"] for key in data]]
                # process-based parallelism:
                # use one third of the available processors (rounded up)
                processor = -(-multiprocessing.cpu_count() // 3)
                # use one fourth of the available processors
                # processor = int(multiprocessing.cpu_count() / 4)
                pool = multiprocessing.Pool(processes=processor)
                print("*** start ***")
                for b in [list_processed[i:i + processor] for i in range(0, len(list_processed), processor)]:
                    dict_tmp = {}
                    # scrape one batch of sites in parallel
                    list_bres = pool.map(scrapes, b)
                    for i in list_bres:
                        dict_tmp.update(i)
                    if len(data) < 1:
                        with open(dir_path_output, 'w') as file:
                            json.dump(dict_tmp, file, indent=4)
                    else:
                        with open(dir_path_output, "r+") as file:
                            old_data = json.load(file)
                            old_data.update(dict_tmp)
                            file.seek(0)
                            json.dump(old_data, file, indent=4)
                    print("scraped", str(b[-1][0] + 1), "of", str(len(list_project_site) - 1))
                    # note: this break stops the loop after the first batch
                    break
                # release the worker processes
                pool.close()
                pool.join()
            else:
                print("wrong output file extension. use json extension.")
    print("*** end ***")


if __name__ == '__main__':
    main()
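For reference, before suspecting the scraper itself it may be worth confirming that the CSV actually parses into the expected columns: with sep='\t', a file that is really comma-separated collapses into a single column, and df_input["clickthrough_url"] then raises a KeyError. A minimal diagnostic sketch (the read_csv arguments are copied from the script above; treating the separator as the failure mode is only an assumption):

import pandas as pd

# does data\1.csv parse into the expected columns?
df = pd.read_csv("data\\1.csv", encoding="gbk", encoding_errors="ignore",
                 sep='\t', on_bad_lines='skip')
print(df.columns.tolist())  # should include "clickthrough_url"
print(df.head())            # eyeball the first few rows

if "clickthrough_url" not in df.columns:
    # everything probably landed in one column, so the separator is wrong;
    # retry with the default comma separator
    df = pd.read_csv("data\\1.csv", encoding="gbk",
                     encoding_errors="ignore", on_bad_lines='skip')
    print(df.columns.tolist())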

The result is as follows (the output was posted as a screenshot and is not reproduced here).

Following suggestions found online, the modified code (its result, again a screenshot, is likewise not reproduced):

# function to get project url
def extract_project_url(df_input):
    list_url = []
    for ele in df_input[["clickthrough_url"]]:
        list_url.append("https://www.indiegogo.com" + ele)
    return list_url
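Note that the double brackets change what is being iterated: df_input["clickthrough_url"] is a Series, and iterating it yields the cell values, whereas df_input[["clickthrough_url"]] is a one-column DataFrame, and iterating a DataFrame yields its column labels, not its rows. A small sketch with made-up data illustrates the difference:

import pandas as pd

# made-up sample resembling the scraped data
df = pd.DataFrame({"clickthrough_url": ["/projects/a", "/projects/b"]})

for ele in df["clickthrough_url"]:  # Series: iterates the values
    print("https://www.indiegogo.com" + ele)
# -> https://www.indiegogo.com/projects/a
# -> https://www.indiegogo.com/projects/b

for ele in df[["clickthrough_url"]]:  # DataFrame: iterates the column labels
    print(ele)
# -> clickthrough_url

So for building URLs, the single-bracket version in the first snippet is the correct one.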

Could anyone please tell me whether this modification is correct, whether the error has anything to do with cookie settings, and how I should proceed with the code? If that's too much trouble, please share relevant links so this newbie can study on their own. Many thanks!
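On the cookie question: the chromedriver.exe check suggests the scraper drives Chrome through Selenium, and in Selenium a cookie can only be added after a page on the matching domain has been loaded. A minimal sketch, assuming Selenium is in use and with a purely hypothetical cookie name and value:

from selenium import webdriver

driver = webdriver.Chrome()
# a cookie can only be set for the domain that is currently loaded
driver.get("https://www.indiegogo.com")
# hypothetical cookie name/value, for illustration only
driver.add_cookie({"name": "session_id", "value": "example-value"})
# subsequent requests in this session carry the cookie
driver.get("https://www.indiegogo.com")
driver.quit()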

