How do I add a server-side delay to a for loop in JavaScript?

I'm tinkering with scraping data from an e-commerce site using Node.js. I use Request to retrieve the page's DOM and Cheerio to do server-side DOM selection. How do I add a server-side delay to the for loop in JavaScript?

const cheerio = require('cheerio');
const request = require('request');

// takes a URL, scrapes the page, and returns an object with the data
let scrapePage = (url) => {
    return new Promise((resolve, reject) => {
        request(url, (error, resp, body) => {
            if (error) {
                reject(error);
            }
            let $ = cheerio.load(body);
            let $url = url;
            let $price = $('#rt-mainbody > div > div.details > div.Data > div:nth-child(4) > div.description').text();
            let obj = {
                url: $url,
                price: $price
            }
            resolve(obj);
        });
    });
};

// Runs scrapePage in a loop
// There is a variable called arrayOfURLs defined elsewhere that contains 100s of URLs
for (let i = 0; i < arrayOfURLs.length; i++) {
    scrapePage(arrayOfURLs[i])
        .then((obj) => {
            // write to a file
        })
        .catch((error) => {
        });
}

The problem is that the server I'm sending requests to sometimes sends back blank data, I'm assuming because I'm firing off too many requests without any kind of pause. Due to the async nature of JS, I'm having a hard time figuring out how to add an effective delay between each iteration of the loop. It's not enough to just add a setTimeout in a synchronous fashion, because setTimeout itself is async, and I'm running this on the server, so there's no Window object.
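In fact, setTimeout is available as a global in Node.js, no Window object required, and wrapping it in a Promise produces an awaitable pause. A minimal sketch, reusing scrapePage and arrayOfURLs from above (the scrapeSequentially wrapper and the 1-second interval are made up for illustration):

// setTimeout is a Node.js global; wrapping it in a Promise makes it awaitable
const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

// Hypothetical wrapper: process one URL at a time with a 1-second pause between them
async function scrapeSequentially() {
    for (const url of arrayOfURLs) {
        await scrapePage(url).catch((error) => console.log(error));
        await delay(1000);
    }
}

scrapeSequentially();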

EDIT

The code above is a simplified version of what I'm working with. The entire code is this:

app.js

const fs = require('fs');
const path = 'urls.txt';
const path2 = 'results.txt';
const scraper = require('./scraper');

let scrapePage = (url) => {
    scraper.scrapePage(url)
        .then((obj) => {
            // console.log('obj from the scraper with Promises was received');
            // console.log(obj);
            // console.log('writing obj to a file');
            fs.appendFile(path2, JSON.stringify(obj) + ', ', (error) => {
                if (error) {
                    console.log(error);
                } else {
                    // console.log('Successfully wrote to ' + path2);
                }
            });
        })
        .catch((error) => {
            console.log('There was an error scraping obj: ');
            console.log(error);
        });
}

fs.readFile(path, 'utf8', (err, data) => {
    if (err) {
        throw err;
    }
    var urlArray = JSON.parse(data);

    // this returns an Unexpected Identifier error
    // const results = await Promise.all(urlArray.map(scrapePage));

    // this returns an Unexpected Token Function error
    // async function scrapePages(){
    //     const results = await Promise.all(urlArray.map(scrapePage));
    // };
});

scraper.js

const request = require('request');
const cheerio = require('cheerio');

exports.scrapePage = (url) => {
    return new Promise((resolve, reject) => {
        request(url, (error, resp, body) => {
            if (error) {
                reject(error);
            }
            let $ = cheerio.load(body);
            let $url = url;
            let $price = $('#rt-mainbody > div > div.details > div.itemData > div:nth-child(4) > div.description').text();
            let obj = {
                url: $url,
                price: $price
            }
            resolve(obj);
        });
    });
}

Answer:

It looks to me like you aren't waiting for your promises to resolve before sending the server response. You could eliminate the for loop entirely by using async/await, e.g.:

const results = await Promise.all(arrayOfURLs.map(scrapePage)); 
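Note that await is only valid inside an async function, which is why the commented-out attempts in app.js above throw the "Unexpected Identifier" and "Unexpected Token" errors (async/await also requires Node 7.6 or later). A minimal sketch of the wrapper, borrowing the scrapePages name from the question's own commented-out code:

// await must live inside an async function, and the function must actually be called
async function scrapePages() {
    const results = await Promise.all(arrayOfURLs.map(scrapePage));
    // every page has resolved by this point; write results to a file, etc.
    return results;
}

scrapePages().catch((error) => console.log(error));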

Answer:

If you want no more than x active connections at a time, you can use throttle. And if you want no more than x per second, you can use throttlePeriod.

Promise.all will never call your resolve handler if even a single request fails, so you can catch any errors and return a Fail object instead:

const Fail = function (details) { this.details = details; };
const max10 = throttle(10)(scrapePage); // max 10 active connections
// const twoPerSecond = throttlePeriod(2, 1000)(scrapePage); // start no more than 2 per second

Promise.all(
    arrayOfURLs.map(
        url => max10(url).catch(err => new Fail([err, url]))
    )
)
.then(results => {
    const successes = results.filter(result => (result && result.constructor) !== Fail);
    const failed = results.filter(result => (result && result.constructor) === Fail);
});
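The throttle and throttlePeriod helpers used above are not part of JavaScript or of a standard module, so treat the names as the answerer's own utilities. A minimal sketch of what a comparable throttle(max) concurrency limiter could look like:

// Sketch of a throttle(max) helper: at most `max` wrapped calls run at once;
// further calls wait in a FIFO queue until a slot frees up.
const throttle = (max) => (fn) => {
    let active = 0;
    const queue = [];
    const next = () => {
        if (active >= max || queue.length === 0) return;
        active++;
        const { args, resolve, reject } = queue.shift();
        Promise.resolve(fn(...args))
            .then(resolve, reject)
            .then(() => { active--; next(); });
    };
    return (...args) => new Promise((resolve, reject) => {
        queue.push({ args, resolve, reject });
        next();
    });
};

With something like this in place, max10 = throttle(10)(scrapePage) behaves as the answer describes: the eleventh URL only starts once one of the first ten has settled.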

Answer:

const cheerio = require('cheerio');
const request = require('request');

let scrapePage = (url) => {
    return new Promise((resolve, reject) => {
        request(url, (error, resp, body) => {
            if (error) {
                reject(error);
                return;
            }
            if (!body) {
                reject('Empty Body');
                return;
            }
            let $ = cheerio.load(body);
            let $url = url;
            let $price = $('#rt-mainbody > div > div.details > div.Data > div:nth-child(4) > div.description').text();
            let obj = {
                url: $url,
                price: $price
            }
            resolve(obj);
        });
    });
};

function processUrl(url) {
    scrapePage(url)
        .then((obj) => {
            // write to a file
            if (arrayOfURLs.length) {
                processUrl(arrayOfURLs.pop());
            }
        })
        .catch((error) => {
            arrayOfURLs.unshift(url);
            if (arrayOfURLs.length) { // put this in a finally block
                processUrl(arrayOfURLs.pop());
            }
        });
}

processUrl(arrayOfURLs.pop());

Here we can use the arrayOfURLs array as a queue: if we get an error or a blank page, we push that URL back into the array. That way we can process each URL one at a time, in a synchronous fashion.
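If you also want the pause the original question asked about, the same queue can wait before picking up the next URL. A minimal sketch with a hypothetical processUrlWithPause variant and a 1-second gap (the Promise-wrapped setTimeout works fine in Node):

// Same queue as above, but with a 1-second pause before the next URL
function processUrlWithPause(url) {
    scrapePage(url)
        .then((obj) => {
            // write to a file
        })
        .catch(() => arrayOfURLs.unshift(url)) // failed URLs go back into the queue
        .then(() => new Promise((resolve) => setTimeout(resolve, 1000)))
        .then(() => {
            if (arrayOfURLs.length) processUrlWithPause(arrayOfURLs.pop());
        });
}

processUrlWithPause(arrayOfURLs.pop());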
